From e686f60ba2d0db39e7683da6ea6c665e514e65ab Mon Sep 17 00:00:00 2001 From: Less Wright Date: Sun, 8 Jun 2025 08:39:46 -0700 Subject: [PATCH 01/34] Create cute_grouped_gemm.py --- .../kernels/blackwell/cute_grouped_gemm.py | 2411 +++++++++++++++++ 1 file changed, 2411 insertions(+) create mode 100644 torchtitan/experiments/kernels/blackwell/cute_grouped_gemm.py diff --git a/torchtitan/experiments/kernels/blackwell/cute_grouped_gemm.py b/torchtitan/experiments/kernels/blackwell/cute_grouped_gemm.py new file mode 100644 index 000000000..743843acb --- /dev/null +++ b/torchtitan/experiments/kernels/blackwell/cute_grouped_gemm.py @@ -0,0 +1,2411 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import functools +from inspect import isclass +from typing import List, Type, Union + +import cuda.bindings.driver as cuda + +import cutlass +import cutlass.cute as cute +import cutlass.torch as cutlass_torch +import cutlass.utils as utils +import cutlass.utils.blackwell_helpers as sm100_utils + +import torch +from cutlass.cute.nvgpu import cpasync, tcgen05 +from cutlass.cute.runtime import from_dlpack + +""" +A grouped GEMM example for the NVIDIA Blackwell SM100 architecture using CUTE DSL + +This example demonstrates an implementation of grouped GEMM using a TMA plus Blackwell SM100 TensorCore +warp-specialized persistent kernel. +The grouped GEMM workload computes a batch of GEMM operations with distinct problem sizes. Pointers to matrices +in global memory are passed to the kernel in an array (also held in global memory). Similarly, problem shapes and +strides are also stored in arrays in GMEM. + +This differs from "Batched Array" GEMM since the size of each GEMM problem in the grouped GEMM concept may be distinct. + +To run this example: + +.. 
code-block:: bash
+
+    python examples/blackwell/grouped_gemm.py \
+      --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \
+      --mma_tiler_mn 128,64 --cluster_shape_mn 1,1 \
+      --problem_sizes_mnkl "(8192,1280,32,1),(16,384,1536,1),(640,1280,16,1),(640,160,16,1)" \
+      --num_groups 4 --tensormap_update_mode SMEM
+
+The above example command creates 4 groups with different m, n, k sizes. The Blackwell tcgen05 MMA tile shape
+is specified as (128, 64) and the cluster shape is (1, 1). The input, MMA accumulator, and output data types
+are set to fp16, fp32, and fp16, respectively.
+
+To collect performance with the NCU profiler:
+
+.. code-block:: bash
+
+    ncu python examples/blackwell/grouped_gemm.py \
+      --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \
+      --mma_tiler_mn 128,64 --cluster_shape_mn 1,1 \
+      --problem_sizes_mnkl "(8192,1280,32,1),(16,384,1536,1),(640,1280,16,1),(640,160,16,1)" \
+      --num_groups 4 --tensormap_update_mode SMEM \
+      --warmup_iterations 1 --iterations 10 --skip_ref_check
+
+This example has some constraints. In addition to the constraints of the Blackwell dense GEMM persistent example,
+the following also apply:
+* Only fp16 and bf16 data types are supported as inputs.
+* Output data types can be fp16, bf16, or fp32.
+* The contiguous dimension of each tensor must be at least 16-byte aligned.
+* The L mode (aka batch size) of each group must be 1.
+* The majorness of A, B, and C must be the same across all groups.
+"""
+
+
+class GroupedGemmKernel:
+
+    def __init__(
+        self,
+        acc_dtype: type[cutlass.Numeric],
+        use_2cta_instrs: bool,
+        mma_tiler_mn: tuple[int, int],
+        cluster_shape_mn: tuple[int, int],
+        tensormap_update_mode: utils.TensorMapUpdateMode = utils.TensorMapUpdateMode.SMEM,
+    ):
+        """Initializes the configuration for a Blackwell grouped GEMM kernel.
+
+        Besides the configurations for dense persistent GEMM, there is an extra config specific to grouped GEMM:
+
+        Tensormap Update Mode:
+        - tensormap_update_mode: Specifies whether the tensormap is
+          updated in global memory (GMEM) or shared memory (SMEM).
+          The two modes are functionally equivalent; the differences are:
+          - When tensormap updates are performed in SMEM, we buffer 3 tensormaps in SMEM for the A, B, and C tensors (each TMA descriptor takes 128B).
+          - Performance varies between modes depending on problem size; the optimal choice differs across workloads.
+
+        :param acc_dtype: Data type of the accumulator.
+        :type acc_dtype: type[cutlass.Numeric]
+        :param use_2cta_instrs: Boolean, True to use the cta_group=2 MMA variant.
+        :type use_2cta_instrs: bool
+        :param mma_tiler_mn: tuple (M, N) shape of the MMA instruction.
+        :type mma_tiler_mn: tuple[int, int]
+        :param cluster_shape_mn: tuple (ClusterM, ClusterN) shape of the cluster.
+        :type cluster_shape_mn: tuple[int, int]
+        :param tensormap_update_mode: Mode for updating the tensormap (GMEM or SMEM), defaults to SMEM.
+ :type tensormap_update_mode: utils.TensorMapUpdateMode, optional + """ + self.acc_dtype: Type[cutlass.Numeric] = acc_dtype + self.use_2cta_instrs = use_2cta_instrs + self.cluster_shape_mn = cluster_shape_mn + # K dimension is deferred in _setup_attributes + self.mma_tiler = (*mma_tiler_mn, 1) + self.cta_group = ( + tcgen05.CtaGroup.TWO if use_2cta_instrs else tcgen05.CtaGroup.ONE + ) + + self.tensormap_update_mode = tensormap_update_mode + # Delegate tensormap ab initialization to MMA warp when SMEM mode is used for better latency hiding + self.delegate_tensormap_ab_init = ( + tensormap_update_mode == utils.TensorMapUpdateMode.SMEM + ) + + self.num_mcast_ctas_a = 1 + self.num_mcast_ctas_b = 1 + self.is_a_mcast = False + self.is_b_mcast = False + + self.occupancy = 1 + # Set specialized warp ids + self.epilog_warp_id = ( + 0, + 1, + 2, + 3, + ) + self.mma_warp_id = 4 + self.tma_warp_id = 5 + self.threads_per_cta = 32 * len( + (self.mma_warp_id, self.tma_warp_id, *self.epilog_warp_id) + ) + # Set barrier id for cta sync, epilog sync, tmem ptr sync and tensormap update sync + self.cta_sync_bar_id = 0 + self.epilog_sync_bar_id = 1 + self.tmem_ptr_sync_bar_id = 2 + # Barrier ID used by MMA/TMA warps to signal A/B tensormap initialization completion + self.tensormap_ab_init_bar_id = 4 + self.num_smem_capacity = sm100_utils.SMEM_CAPACITY["sm100"] + self.num_tma_load_bytes = 0 + + def _setup_attributes(self): + """Set up configurations that are dependent on GEMM inputs + + Most of the implementation follows standard dense GEMM patterns, + with the key difference being additional consideration for SMEM + buffer needed for tensormap updates. + """ + # Configure tiled mma + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + + # Compute mma/cluster/tile shapes + mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2]) + mma_inst_tile_k = 4 + self.mma_tiler = ( + self.mma_tiler[0], + self.mma_tiler[1], + mma_inst_shape_k * mma_inst_tile_k, + ) + self.cta_tile_shape_mnk = ( + self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape), + self.mma_tiler[1], + self.mma_tiler[2], + ) + self.cluster_tile_shape_mnk = tuple( + x * y for x, y in zip(self.cta_tile_shape_mnk, (*self.cluster_shape_mn, 1)) + ) + + # Compute cluster layout + self.cluster_layout_vmnk = cute.tiled_divide( + cute.make_layout((*self.cluster_shape_mn, 1)), + (tiled_mma.thr_id.shape,), + ) + + # Compute number of multicast CTAs for A/B + self.num_mcast_ctas_a = cute.size(self.cluster_layout_vmnk.shape[2]) + self.num_mcast_ctas_b = cute.size(self.cluster_layout_vmnk.shape[1]) + self.is_a_mcast = self.num_mcast_ctas_a > 1 + self.is_b_mcast = self.num_mcast_ctas_b > 1 + + # Compute epilogue subtile + self.epi_tile = utils.compute_epilogue_tile_shape( + self.cta_tile_shape_mnk, + self.use_2cta_instrs, + self.c_layout, + self.c_dtype, + ) + + # Setup A/B/C stage count in shared memory and ACC stage count in tensor memory + self.num_acc_stage, self.num_ab_stage, self.num_epi_stage = ( + self._compute_stages( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.b_dtype, + self.epi_tile, + self.c_dtype, + self.c_layout, + self.num_smem_capacity, + self.occupancy, + ) + ) + + self.a_smem_layout_staged = sm100_utils.make_smem_layout_a( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.num_ab_stage, + ) + self.b_smem_layout_staged = sm100_utils.make_smem_layout_b( + tiled_mma, + self.mma_tiler, + self.b_dtype, + 
self.num_ab_stage, + ) + self.epi_smem_layout_staged = sm100_utils.make_smem_layout_epi( + self.c_dtype, + self.c_layout, + self.epi_tile, + self.num_epi_stage, + ) + + tensor_smem_bytes = self._get_tensor_smem_bytes( + self.a_smem_layout_staged, + self.a_dtype, + self.b_smem_layout_staged, + self.b_dtype, + self.epi_smem_layout_staged, + self.c_dtype, + ) + mbar_smem_bytes = self._get_mbar_smem_bytes( + num_acc_stage=self.num_acc_stage, + num_ab_stage=self.num_ab_stage, + num_epi_stage=self.num_epi_stage, + ) + tensormap_smem_bytes = self._get_tensormap_smem_bytes( + self.tensormap_update_mode + ) + if ( + mbar_smem_bytes + + tensormap_smem_bytes + + GroupedGemmKernel.tensor_memory_management_bytes + > self.reserved_smem_bytes + ): + raise ValueError( + f"smem consumption for mbar and tensormap {mbar_smem_bytes + tensormap_smem_bytes} exceeds the " + f"reserved smem bytes {self.reserved_smem_bytes}" + ) + + # Compute the number of tensor memory allocation columns + self.num_tmem_alloc_cols = self._compute_num_tmem_alloc_cols( + tiled_mma, self.mma_tiler, self.num_acc_stage + ) + + @cute.jit + def __call__( + self, + initial_a: cute.Tensor, + initial_b: cute.Tensor, + initial_c: cute.Tensor, + group_count: cutlass.Constexpr[int], + problem_shape_mnkl: cute.Tensor, + strides_abc: cute.Tensor, + tensor_address_abc: cute.Tensor, + total_num_clusters: cutlass.Constexpr[int], + tensormap_cute_tensor: cute.Tensor, + max_active_clusters: cutlass.Constexpr[int], + stream: cuda.CUstream, + ): + """Execute the GEMM operation in steps: + - Setup static attributes before smem/grid/tma computation + - Setup TMA load/store atoms and tensors + - Compute grid size with regard to hardware constraints + - Define shared storage for kernel + - Launch the kernel synchronously + + For grouped GEMM, tensor shapes, tensor strides, and tensor address are all provided + by different tensors in global memory. The "initial" tensors only carry data type and + majorness information. + + :param initial_a: Initial tensor A, used for data type and majorness information. + :type initial_a: cute.Tensor + :param initial_b: Initial tensor B, used for data type and majorness information. + :type initial_b: cute.Tensor + :param initial_c: Initial tensor C, used for data type and majorness information. + :type initial_c: cute.Tensor + :param group_count: The number of GEMM groups. + :type group_count: cutlass.Constexpr[int] + :param problem_shape_mnkl: Tensor containing the (M, N, K, L) shape for each group. + :type problem_shape_mnkl: cute.Tensor + :param strides_abc: Tensor containing the strides for A, B, and C for each group. + :type strides_abc: cute.Tensor + :param tensor_address_abc: Tensor containing the base addresses for A, B, and C for each group. + :type tensor_address_abc: cute.Tensor + :param total_num_clusters: Total number of clusters needed for all groups. + :type total_num_clusters: cutlass.Constexpr[int] + :param tensormap_cute_tensor: Tensor for storing tensormaps. + :type tensormap_cute_tensor: cute.Tensor + :param max_active_clusters: Maximum number of active clusters. + :type max_active_clusters: cutlass.Constexpr[int] + :param stream: CUDA stream for asynchronous execution. + :type stream: cuda.CUstream + :raises TypeError: If A and B data types do not match. 
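+
+        A minimal host-side sketch of the metadata this call consumes (illustrative
+        only: the tensor names, shapes, and int32/int64 dtypes below are assumptions,
+        not taken from this file; the layouts follow the (group_count, 3, 2) strides
+        and (group_count, 3) address conventions documented for grouped GEMM in this
+        file):
+
+        .. code-block:: python
+
+            import torch
+
+            # One group with (M, N, K, L) = (256, 512, 128, 1); L must be 1.
+            a = torch.randn(256, 128, dtype=torch.float16, device="cuda")
+            b = torch.randn(512, 128, dtype=torch.float16, device="cuda")
+            c = torch.empty(256, 512, dtype=torch.float16, device="cuda")
+
+            # (group_count, 4): one (M, N, K, L) row per group.
+            problem_shape_mnkl = torch.tensor(
+                [[256, 512, 128, 1]], dtype=torch.int32, device="cuda"
+            )
+            # (group_count, 3, 2): the two leading strides of A, B, and C per group.
+            strides_abc = torch.tensor(
+                [[[128, 1], [128, 1], [512, 1]]], dtype=torch.int32, device="cuda"
+            )
+            # (group_count, 3): base addresses of A, B, and C as 64-bit integers.
+            tensor_address_abc = torch.tensor(
+                [[a.data_ptr(), b.data_ptr(), c.data_ptr()]],
+                dtype=torch.int64, device="cuda",
+            )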
+ """ + self.a_dtype = initial_a.element_type + self.b_dtype = initial_b.element_type + self.c_dtype = initial_c.element_type + self.a_major_mode = utils.LayoutEnum.from_tensor(initial_a).mma_major_mode() + self.b_major_mode = utils.LayoutEnum.from_tensor(initial_b).mma_major_mode() + self.c_layout = utils.LayoutEnum.from_tensor(initial_c) + if cutlass.const_expr(self.a_dtype != self.b_dtype): + raise TypeError(f"Type mismatch: {self.a_dtype} != {self.b_dtype}") + + # Setup attributes that dependent on gemm inputs + self._setup_attributes() + + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + atom_thr_size = cute.size(tiled_mma.thr_id.shape) + + # Setup TMA load for A + a_op = self._get_tma_atom_kind(atom_thr_size, self.is_a_mcast) + a_smem_layout = cute.slice_(self.a_smem_layout_staged, (None, None, None, 0)) + tma_atom_a, tma_tensor_a = cute.nvgpu.make_tma_tile_atom_A( + a_op, + initial_a, + a_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + ) + + # Setup TMA load for B + b_op = self._get_tma_atom_kind(atom_thr_size, self.is_b_mcast) + b_smem_layout = cute.slice_(self.b_smem_layout_staged, (None, None, None, 0)) + tma_atom_b, tma_tensor_b = cute.nvgpu.make_tma_tile_atom_B( + b_op, + initial_b, + b_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + ) + + a_copy_size = cute.size_in_bytes(self.a_dtype, a_smem_layout) + b_copy_size = cute.size_in_bytes(self.b_dtype, b_smem_layout) + self.num_tma_load_bytes = (a_copy_size + b_copy_size) * atom_thr_size + + # Setup TMA store for C + tma_atom_c = None + tma_tensor_c = None + c_cta_v_layout = cute.composition( + cute.make_identity_layout(initial_c.shape), self.epi_tile + ) + epi_smem_layout = cute.slice_(self.epi_smem_layout_staged, (None, None, 0)) + tma_atom_c, tma_tensor_c = cpasync.make_tma_tile_atom( + cpasync.CopyBulkTensorTileS2GOp(), + initial_c, + epi_smem_layout, + c_cta_v_layout, + ) + + self.tile_sched_params, grid = self._compute_grid( + total_num_clusters, self.cluster_shape_mn, max_active_clusters + ) + + self.buffer_align_bytes = 1024 + self.size_tensormap_in_i64 = ( + 0 + if cutlass.const_expr( + self.tensormap_update_mode == utils.TensorMapUpdateMode.GMEM + ) + else GroupedGemmKernel.num_tensormaps + * GroupedGemmKernel.bytes_per_tensormap + // 8 + ) + + # Define shared storage for kernel + @cute.struct + class SharedStorage: + tensormap_buffer: cute.struct.MemRange[ + cutlass.Int64, self.size_tensormap_in_i64 + ] + ab_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] + acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] + tmem_dealloc_mbar_ptr: cutlass.Int64 + tmem_holding_buf: cutlass.Int32 + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC: cute.struct.Align[ + cute.struct.MemRange[ + self.c_dtype, + cute.cosize(self.epi_smem_layout_staged.outer), + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_M, MMA_K, STAGE) + sA: cute.struct.Align[ + cute.struct.MemRange[ + self.a_dtype, cute.cosize(self.a_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_N, MMA_K, STAGE) + sB: cute.struct.Align[ + cute.struct.MemRange[ + self.b_dtype, cute.cosize(self.b_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + + self.shared_storage = 
SharedStorage + + # Launch the kernel synchronously + self.kernel( + tiled_mma, + tma_atom_a, + tma_tensor_a, + tma_atom_b, + tma_tensor_b, + tma_atom_c, + tma_tensor_c, + self.cluster_layout_vmnk, + self.a_smem_layout_staged, + self.b_smem_layout_staged, + self.epi_smem_layout_staged, + self.epi_tile, + self.tile_sched_params, + group_count, + problem_shape_mnkl, + strides_abc, + tensor_address_abc, + tensormap_cute_tensor, + ).launch( + grid=grid, + block=[self.threads_per_cta, 1, 1], + cluster=(*self.cluster_shape_mn, 1), + smem=self.shared_storage.size_in_bytes(), + stream=stream, + ) + return + + # GPU device kernel + @cute.kernel + def kernel( + self, + tiled_mma: cute.TiledMma, + tma_atom_a: cute.CopyAtom, + mA_mkl: cute.Tensor, + tma_atom_b: cute.CopyAtom, + mB_nkl: cute.Tensor, + tma_atom_c: cute.CopyAtom, + mC_mnl: cute.Tensor, + cluster_layout_vmnk: cute.Layout, + a_smem_layout_staged: cute.ComposedLayout, + b_smem_layout_staged: cute.ComposedLayout, + epi_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout], + epi_tile: cute.Tile, + tile_sched_params: utils.PersistentTileSchedulerParams, + group_count: cutlass.Constexpr[int], + problem_sizes_mnkl: cute.Tensor, + strides_abc: cute.Tensor, + ptrs_abc: cute.Tensor, + tensormaps: cute.Tensor, + ): + """ + GPU device kernel performing the grouped GEMM computation. + """ + warp_idx = cute.arch.warp_idx() + warp_idx = cute.arch.make_warp_uniform(warp_idx) + + # + # Prefetch tma desc + # + if warp_idx == self.tma_warp_id: + cpasync.prefetch_descriptor(tma_atom_a) + cpasync.prefetch_descriptor(tma_atom_b) + cpasync.prefetch_descriptor(tma_atom_c) + + use_2cta_instrs = cute.size(tiled_mma.thr_id.shape) == 2 + + # + # Setup cta/thread coordinates + # + # Coord inside cluster + bid = cute.arch.block_idx() + mma_tile_coord_v = bid[0] % cute.size(tiled_mma.thr_id.shape) + is_leader_cta = mma_tile_coord_v == 0 + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord( + cta_rank_in_cluster + ) + # Coord inside cta + tidx, _, _ = cute.arch.thread_idx() + + # + # Alloc and init: tensormap buffer, a+b full/empty, accumulator full/empty, tensor memory dealloc barrier + # + smem = utils.SmemAllocator() + storage = smem.allocate(self.shared_storage) + + tensormap_a_smem_ptr = None + tensormap_b_smem_ptr = None + tensormap_c_smem_ptr = None + if cutlass.const_expr( + self.tensormap_update_mode == utils.TensorMapUpdateMode.SMEM + ): + tensormap_smem_ptr = storage.tensormap_buffer.data_ptr() + tensormap_a_smem_ptr = tensormap_smem_ptr + tensormap_b_smem_ptr = ( + tensormap_a_smem_ptr + GroupedGemmKernel.bytes_per_tensormap // 8 + ) + tensormap_c_smem_ptr = ( + tensormap_b_smem_ptr + GroupedGemmKernel.bytes_per_tensormap // 8 + ) + ab_full_mbar_ptr = storage.ab_full_mbar_ptr.data_ptr() + ab_empty_mbar_ptr = storage.ab_empty_mbar_ptr.data_ptr() + acc_full_mbar_ptr = storage.acc_full_mbar_ptr.data_ptr() + acc_empty_mbar_ptr = storage.acc_empty_mbar_ptr.data_ptr() + tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr + tmem_holding_buf = storage.tmem_holding_buf + + # init barrier for loading A, B with TMA + if warp_idx == self.epilog_warp_id[0]: + for k_stage in range(self.num_ab_stage): + num_tma_producer = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1 + with cute.arch.elect_one(): + cute.arch.mbarrier_init_arrive_cnt(ab_full_mbar_ptr + k_stage, 1) + cute.arch.mbarrier_init_arrive_cnt( + ab_empty_mbar_ptr + k_stage, num_tma_producer + ) + # 
Accumulator barrier init + if warp_idx == self.mma_warp_id: + for acc_stage in range(self.num_acc_stage): + with cute.arch.elect_one(): + cute.arch.mbarrier_init_arrive_cnt(acc_full_mbar_ptr + acc_stage, 1) + cute.arch.mbarrier_init_arrive_cnt( + acc_empty_mbar_ptr + acc_stage, 8 if use_2cta_instrs else 4 + ) + # Tensor memory dealloc barrier init + if use_2cta_instrs: + if warp_idx == self.tma_warp_id: + num_tmem_dealloc_threads = 32 + with cute.arch.elect_one(): + cute.arch.mbarrier_init_arrive_cnt( + tmem_dealloc_mbar_ptr, num_tmem_dealloc_threads + ) + cute.arch.mbarrier_init_fence() + + # Cluster arrive after barrier init + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_arrive_relaxed() + + # + # Setup smem tensor A/B/C + # + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC = storage.sC.get_tensor( + epi_smem_layout_staged.outer, swizzle=epi_smem_layout_staged.inner + ) + # (MMA, MMA_M, MMA_K, STAGE) + sA = storage.sA.get_tensor( + a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner + ) + # (MMA, MMA_N, MMA_K, STAGE) + sB = storage.sB.get_tensor( + b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner + ) + + # + # Compute multicast mask for A/B buffer full and empty + # + a_full_mcast_mask = None + b_full_mcast_mask = None + ab_empty_mcast_mask = None + if self.is_a_mcast or self.is_b_mcast or use_2cta_instrs: + a_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=2 + ) + b_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=1 + ) + ab_empty_mcast_mask = a_full_mcast_mask | b_full_mcast_mask + acc_full_mcast_mask = None + if use_2cta_instrs: + acc_full_mcast_mask = cute.make_layout_image_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mode=0 + ) + block_in_cluster_coord_vmnk_peer = ( + block_in_cluster_coord_vmnk[0] ^ 1, + *block_in_cluster_coord_vmnk[1:], + ) + a_full_mcast_mask_peer = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk_peer, mcast_mode=2 + ) + b_full_mcast_mask_peer = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk_peer, mcast_mode=1 + ) + ab_empty_mcast_mask = ( + a_full_mcast_mask_peer + | b_full_mcast_mask_peer + | cutlass.Int16( + 0 if ab_empty_mcast_mask is None else ab_empty_mcast_mask + ) + ) + + # + # Local_tile partition global tensors + # + # (bM, bK, loopM, loopK, loopL) + gA_mkl = cute.local_tile( + mA_mkl, cute.slice_(self.mma_tiler, (None, 0, None)), (None, None, None) + ) + # (bN, bK, loopN, loopK, loopL) + gB_nkl = cute.local_tile( + mB_nkl, cute.slice_(self.mma_tiler, (0, None, None)), (None, None, None) + ) + # (bM, bN, loopM, loopN, loopL) + gC_mnl = cute.local_tile( + mC_mnl, cute.slice_(self.mma_tiler, (None, None, 0)), (None, None, None) + ) + + # + # Partition global tensor for TiledMMA_A/B/C + # + thr_mma = tiled_mma.get_slice(mma_tile_coord_v) + # (MMA, MMA_M, MMA_K, loopM, loopK, loopL) + tCgA = thr_mma.partition_A(gA_mkl) + # (MMA, MMA_N, MMA_K, loopN, loopK, loopL) + tCgB = thr_mma.partition_B(gB_nkl) + # (MMA, MMA_M, MMA_N, loopM, loopN, loopL) + tCgC = thr_mma.partition_C(gC_mnl) + + # + # Partition global/shared tensor for load A, B with TMA + # + a_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopM, loopK, loopL) + tAsA, tAgA = cpasync.tma_partition( + tma_atom_a, + block_in_cluster_coord_vmnk[2], + 
a_cta_layout, + cute.group_modes(sA, 0, 3), + cute.group_modes(tCgA, 0, 3), + ) + # TMA load B partition_S/D + b_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopM, loopK, loopL) + tBsB, tBgB = cpasync.tma_partition( + tma_atom_b, + block_in_cluster_coord_vmnk[1], + b_cta_layout, + cute.group_modes(sB, 0, 3), + cute.group_modes(tCgB, 0, 3), + ) + + # + # Partition shared/tensor memory tensor for TiledMMA_A/B/C + # + # (MMA, MMA_M, MMA_K, STAGE) + tCrA = tiled_mma.make_fragment_A(sA) + # (MMA, MMA_N, MMA_K, STAGE) + tCrB = tiled_mma.make_fragment_B(sB) + # (MMA, MMA_M, MMA_N) + acc_shape = tiled_mma.partition_shape_C(self.mma_tiler[:2]) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_fake = tiled_mma.make_fragment_C( + cute.append(acc_shape, self.num_acc_stage) + ) + + # + # Cluster wait before tensor memory alloc + # + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_wait() + else: + cute.arch.barrier( + barrier_id=self.cta_sync_bar_id, number_of_threads=self.threads_per_cta + ) + + # + # Get tensormap buffer address + # + grid_dim = cute.arch.grid_dim() + tensormap_workspace_idx = ( + bid[2] * grid_dim[1] * grid_dim[0] + bid[1] * grid_dim[0] + bid[0] + ) + + tensormap_manager = utils.TensorMapManager( + self.tensormap_update_mode, GroupedGemmKernel.bytes_per_tensormap + ) + tensormap_a_ptr = tensormap_manager.get_tensormap_ptr( + tensormaps[(tensormap_workspace_idx, 0, None)].iterator + ) + tensormap_b_ptr = tensormap_manager.get_tensormap_ptr( + tensormaps[(tensormap_workspace_idx, 1, None)].iterator + ) + tensormap_c_ptr = tensormap_manager.get_tensormap_ptr( + tensormaps[(tensormap_workspace_idx, 2, None)].iterator + ) + # Setup tensormap initialization pointer based on the mode + if cutlass.const_expr( + self.tensormap_update_mode == utils.TensorMapUpdateMode.SMEM + ): + tensormap_a_init_ptr = tensormap_a_smem_ptr + tensormap_b_init_ptr = tensormap_b_smem_ptr + tensormap_c_init_ptr = tensormap_c_smem_ptr + else: + tensormap_a_init_ptr = tensormap_a_ptr + tensormap_b_init_ptr = tensormap_b_ptr + tensormap_c_init_ptr = tensormap_c_ptr + + # + # Specialized TMA load warp + # + if warp_idx == self.tma_warp_id: + # Initialize tensormaps for A, B + if cutlass.const_expr(self.delegate_tensormap_ab_init == False): + tensormap_manager.init_tensormap_from_atom( + tma_atom_a, tensormap_a_init_ptr, self.tma_warp_id + ) + tensormap_manager.init_tensormap_from_atom( + tma_atom_b, tensormap_b_init_ptr, self.tma_warp_id + ) + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, bid, grid_dim + ) + # grouped gemm tile scheduler helper will compute the group index for the tile we're working on + group_gemm_ts_helper = utils.GroupedGemmTileSchedulerHelper( + group_count, + tile_sched_params, + self.cluster_tile_shape_mnk, + utils.create_initial_search_state(), + ) + tensormap_init_done = cutlass.Boolean(False) + # tile count we have searched + total_k_block_cnt = cutlass.Int32(0) + # group index of last tile + last_group_idx = cutlass.Int32(-1) + work_tile = tile_sched.initial_work_tile_info() + while work_tile.is_valid_tile: + cur_tile_coord = work_tile.tile_idx + grouped_gemm_cta_tile_info = group_gemm_ts_helper.delinearize_z( + cur_tile_coord, + problem_sizes_mnkl, + ) + cur_k_block_cnt = grouped_gemm_cta_tile_info.cta_tile_count_k + cur_group_idx = grouped_gemm_cta_tile_info.group_idx + is_group_changed = cur_group_idx != 
last_group_idx + # skip tensormap update if we're working on the same group + if is_group_changed: + real_tensor_a = self.make_tensor_for_tensormap_update( + cur_group_idx, + self.a_dtype, + ( + grouped_gemm_cta_tile_info.problem_shape_m, + grouped_gemm_cta_tile_info.problem_shape_n, + grouped_gemm_cta_tile_info.problem_shape_k, + ), + strides_abc, + ptrs_abc, + 0, # 0 for tensor A + ) + real_tensor_b = self.make_tensor_for_tensormap_update( + cur_group_idx, + self.b_dtype, + ( + grouped_gemm_cta_tile_info.problem_shape_m, + grouped_gemm_cta_tile_info.problem_shape_n, + grouped_gemm_cta_tile_info.problem_shape_k, + ), + strides_abc, + ptrs_abc, + 1, # 1 for tensor B + ) + # wait tensormap initialization complete before update + if tensormap_init_done == False: + if cutlass.const_expr(self.delegate_tensormap_ab_init): + cute.arch.barrier( + barrier_id=self.tensormap_ab_init_bar_id, + number_of_threads=64, + ) + tensormap_manager.fence_tensormap_initialization() + tensormap_init_done = True + + tensormap_manager.update_tensormap( + (real_tensor_a, real_tensor_b), + (tma_atom_a, tma_atom_b), + (tensormap_a_ptr, tensormap_b_ptr), + self.tma_warp_id, + (tensormap_a_smem_ptr, tensormap_b_smem_ptr), + ) + + mma_tile_coord_mnl = ( + grouped_gemm_cta_tile_info.cta_tile_idx_m + // cute.size(tiled_mma.thr_id.shape), + grouped_gemm_cta_tile_info.cta_tile_idx_n, + 0, + ) + + # + # Slice to per mma tile index + # + # ((atom_v, rest_v), loopK) + tAgA_slice = tAgA[ + (None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2]) + ] + # ((atom_v, rest_v), loopK) + tBgB_slice = tBgB[ + (None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2]) + ] + + num_prev_k_blk = total_k_block_cnt + total_k_block_cnt += cur_k_block_cnt + + # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt + tma_wr_k_block = cutlass.Int32(0) + smem_wr_buffer = (num_prev_k_blk + tma_wr_k_block) % self.num_ab_stage + tma_wr_ab_empty_phase = ( + num_prev_k_blk + tma_wr_k_block + ) // self.num_ab_stage % 2 ^ 1 + peek_ab_empty_status = cute.arch.conditional_mbarrier_try_wait( + tma_wr_k_block < cur_k_block_cnt, + ab_empty_mbar_ptr + smem_wr_buffer, + tma_wr_ab_empty_phase, + ) + # ensure the update to tensormap has completed before using it + if is_group_changed: + tensormap_manager.fence_tensormap_update(tensormap_a_ptr) + tensormap_manager.fence_tensormap_update(tensormap_b_ptr) + # + # Tma load loop + # + for k_block in cutlass.range_dynamic(0, cur_k_block_cnt, 1, unroll=1): + tma_wr_k_block_next = tma_wr_k_block + 1 + smem_wr_buffer_next = ( + num_prev_k_blk + tma_wr_k_block_next + ) % self.num_ab_stage + tma_wr_ab_empty_phase_next = ( + tma_wr_ab_empty_phase ^ 1 + if smem_wr_buffer_next == 0 + else tma_wr_ab_empty_phase + ) + + smem_full_mbar_ptr = ab_full_mbar_ptr + smem_wr_buffer + + # Wait for AB buffer empty + if peek_ab_empty_status == 0: + cute.arch.mbarrier_wait( + ab_empty_mbar_ptr + smem_wr_buffer, tma_wr_ab_empty_phase + ) + + # Init AB buffer full transaction byte + if is_leader_cta: + with cute.arch.elect_one(): + cute.arch.mbarrier_init_tx_bytes( + smem_full_mbar_ptr, self.num_tma_load_bytes + ) + + # Load A/B with TMA + cute.copy( + tma_atom_a, + tAgA_slice[(None, tma_wr_k_block)], + tAsA[(None, smem_wr_buffer)], + tma_bar_ptr=smem_full_mbar_ptr, + mcast_mask=a_full_mcast_mask, + tma_desc_ptr=tensormap_manager.get_tensormap_ptr( + tensormap_a_ptr, + cute.AddressSpace.generic, + ), + ) + cute.copy( + tma_atom_b, + tBgB_slice[(None, tma_wr_k_block)], + tBsB[(None, smem_wr_buffer)], + 
tma_bar_ptr=smem_full_mbar_ptr, + mcast_mask=b_full_mcast_mask, + tma_desc_ptr=tensormap_manager.get_tensormap_ptr( + tensormap_b_ptr, + cute.AddressSpace.generic, + ), + ) + + # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt + k_block + 1 + peek_ab_empty_status = cute.arch.conditional_mbarrier_try_wait( + tma_wr_k_block_next < cur_k_block_cnt, + ab_empty_mbar_ptr + smem_wr_buffer_next, + tma_wr_ab_empty_phase_next, + ) + + tma_wr_k_block = tma_wr_k_block_next + smem_wr_buffer = smem_wr_buffer_next + tma_wr_ab_empty_phase = tma_wr_ab_empty_phase_next + + # Advance to next tile + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + last_group_idx = cur_group_idx + + # + # Specialized MMA warp + # + if warp_idx == self.mma_warp_id: + # initilize tensormap A, B for TMA warp + if cutlass.const_expr(self.delegate_tensormap_ab_init): + tensormap_manager.init_tensormap_from_atom( + tma_atom_a, tensormap_a_init_ptr, self.mma_warp_id + ) + tensormap_manager.init_tensormap_from_atom( + tma_atom_b, tensormap_b_init_ptr, self.mma_warp_id + ) + # signal tensormap initialization has finished + cute.arch.barrier( + barrier_id=self.tensormap_ab_init_bar_id, number_of_threads=64 + ) + # Bar sync for retrieve tmem ptr from shared mem + tmem_ptr_read_threads = 32 * len((self.mma_warp_id, *self.epilog_warp_id)) + cute.arch.barrier( + barrier_id=self.tmem_ptr_sync_bar_id, + number_of_threads=tmem_ptr_read_threads, + ) + + # + # Retrieving tensor memory ptr and make accumulator tensor + # + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, + alignment=16, + ptr_to_buffer_holding_addr=tmem_holding_buf, + ) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_base = cute.make_tensor(tmem_ptr, tCtAcc_fake.layout) + + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, bid, grid_dim + ) + # grouped gemm tile scheduler helper will compute the group index for the tile we're working on + group_gemm_ts_helper = utils.GroupedGemmTileSchedulerHelper( + group_count, + tile_sched_params, + self.cluster_tile_shape_mnk, + utils.create_initial_search_state(), + ) + + work_tile = tile_sched.initial_work_tile_info() + # tile count we have searched + total_k_block_cnt = cutlass.Int32(0) + while work_tile.is_valid_tile: + cur_tile_coord = work_tile.tile_idx + # MMA warp is only interested in number of tiles along K dimension + cur_k_block_cnt, cur_group_idx = ( + group_gemm_ts_helper.search_cluster_tile_count_k( + cur_tile_coord, + problem_sizes_mnkl, + ) + ) + # Set tensor memory buffer for current tile + acc_buf_idx = tile_sched.num_tiles_executed % self.num_acc_stage + # (MMA, MMA_M, MMA_N) + tCtAcc = tCtAcc_base[(None, None, None, acc_buf_idx)] + + num_prev_k_blk = total_k_block_cnt + total_k_block_cnt += cur_k_block_cnt + + # Peek (try_wait) AB buffer full for k_block = 0 + mma_rd_k_block = cutlass.Int32(0) + smem_rd_buffer = (num_prev_k_blk + mma_rd_k_block) % self.num_ab_stage + need_check_rd_buffer_full = ( + mma_rd_k_block < cur_k_block_cnt and is_leader_cta + ) + mma_rd_ab_full_phase = ( + (num_prev_k_blk + mma_rd_k_block) // self.num_ab_stage % 2 + ) + peek_ab_full_status = cute.arch.conditional_mbarrier_try_wait( + need_check_rd_buffer_full, + ab_full_mbar_ptr + smem_rd_buffer, + mma_rd_ab_full_phase, + ) + + # + # Wait for accumulator buffer empty + # + if is_leader_cta: + acc_empty_phase = ( + tile_sched.num_tiles_executed // self.num_acc_stage % 2 ^ 1 + ) + cute.arch.mbarrier_wait( + acc_empty_mbar_ptr + 
acc_buf_idx, acc_empty_phase + ) + + # + # Reset the ACCUMULATE field for each tile + # + tiled_mma.set(tcgen05.Field.ACCUMULATE, False) + + # + # Mma mainloop + # + for k_block in cutlass.range_dynamic(0, cur_k_block_cnt, 1, unroll=1): + mma_rd_k_block_next = cutlass.Int32(k_block + 1) + smem_rd_buffer_next = ( + num_prev_k_blk + mma_rd_k_block_next + ) % self.num_ab_stage + mma_rd_ab_full_phase_next = ( + mma_rd_ab_full_phase ^ 1 + if smem_rd_buffer_next == 0 + else mma_rd_ab_full_phase + ) + if is_leader_cta: + # Wait for AB buffer full + if peek_ab_full_status == 0: + cute.arch.mbarrier_wait( + ab_full_mbar_ptr + smem_rd_buffer, mma_rd_ab_full_phase + ) + + # tCtAcc += tCrA * tCrB + num_kphases = cute.size(tCrA, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = (None, None, kphase_idx, smem_rd_buffer) + + cute.gemm( + tiled_mma, + tCtAcc, + tCrA[kphase_coord], + tCrB[kphase_coord], + tCtAcc, + ) + # Enable accumulate on tCtAcc after first kphase + tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + + # Async arrive AB buffer empty + with cute.arch.elect_one(): + tcgen05.commit( + ab_empty_mbar_ptr + smem_rd_buffer, + ab_empty_mcast_mask, + self.cta_group, + ) + + # Peek (try_wait) AB buffer full for k_block = k_block + 1 + need_check_rd_buffer_full = ( + mma_rd_k_block_next < cur_k_block_cnt and is_leader_cta + ) + + peek_ab_full_status = cute.arch.conditional_mbarrier_try_wait( + need_check_rd_buffer_full, + ab_full_mbar_ptr + smem_rd_buffer_next, + mma_rd_ab_full_phase_next, + ) + + mma_rd_k_block = mma_rd_k_block_next + smem_rd_buffer = smem_rd_buffer_next + mma_rd_ab_full_phase = mma_rd_ab_full_phase_next + + # + # Async arrive accumulator buffer full + # + if is_leader_cta: + with cute.arch.elect_one(): + tcgen05.commit( + acc_full_mbar_ptr + acc_buf_idx, + acc_full_mcast_mask, + self.cta_group, + ) + + # + # Advance to next tile + # + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + + # + # Specialized epilogue warps + # + if warp_idx < self.mma_warp_id: + # initialize tensorap for C + tensormap_manager.init_tensormap_from_atom( + tma_atom_c, + tensormap_c_init_ptr, + self.epilog_warp_id[0], + ) + # Alloc tensor memory buffer + if warp_idx == self.epilog_warp_id[0]: + cute.arch.alloc_tmem( + self.num_tmem_alloc_cols, + tmem_holding_buf, + is_two_cta=use_2cta_instrs, + ) + + # + # Bar sync for retrieve tensor memory ptr from shared memory + # + tmem_ptr_read_threads = 32 * len((self.mma_warp_id, *self.epilog_warp_id)) + cute.arch.barrier( + barrier_id=self.tmem_ptr_sync_bar_id, + number_of_threads=tmem_ptr_read_threads, + ) + + # + # Retrieving tensor memory ptr and make accumulator tensor + # + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, + alignment=16, + ptr_to_buffer_holding_addr=tmem_holding_buf, + ) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_base = cute.make_tensor(tmem_ptr, tCtAcc_fake.layout) + + epi_tidx = tidx + # + # Partition for epilogue + # + tiled_copy_t2r, tTR_tAcc_base, tTR_rAcc = ( + self.epilog_tmem_copy_and_partition( + epi_tidx, tCtAcc_base, tCgC, epi_tile, use_2cta_instrs + ) + ) + + tTR_rC = cute.make_fragment(tTR_rAcc.shape, self.c_dtype) + tiled_copy_r2s, tRS_rC, tRS_sC = self.epilog_smem_copy_and_partition( + tiled_copy_t2r, tTR_rC, epi_tidx, sC + ) + tma_atom_c, bSG_sC, bSG_gC_partitioned = ( + self.epilog_gmem_copy_and_partition(tma_atom_c, tCgC, epi_tile, sC) + ) + + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, bid, 
grid_dim + ) + # grouped gemm tile scheduler helper will compute the group index for the tile we're working on + group_gemm_ts_helper = utils.GroupedGemmTileSchedulerHelper( + group_count, + tile_sched_params, + self.cluster_tile_shape_mnk, + utils.create_initial_search_state(), + ) + + work_tile = tile_sched.initial_work_tile_info() + # wait tensormap initialization complete before update + tensormap_manager.fence_tensormap_initialization() + # tile count we have searched + total_k_block_cnt = cutlass.Int32(0) + # group index of last tile + last_group_idx = cutlass.Int32(-1) + while work_tile.is_valid_tile: + cur_tile_coord = work_tile.tile_idx + grouped_gemm_cta_tile_info = group_gemm_ts_helper.delinearize_z( + cur_tile_coord, + problem_sizes_mnkl, + ) + cur_group_idx = grouped_gemm_cta_tile_info.group_idx + is_group_changed = cur_group_idx != last_group_idx + if is_group_changed: + # construct tensor C based on real address, shape and stride information + real_tensor_c = self.make_tensor_for_tensormap_update( + cur_group_idx, + self.c_dtype, + ( + grouped_gemm_cta_tile_info.problem_shape_m, + grouped_gemm_cta_tile_info.problem_shape_n, + grouped_gemm_cta_tile_info.problem_shape_k, + ), + strides_abc, + ptrs_abc, + 2, # 2 for tensor C + ) + tensormap_manager.update_tensormap( + ((real_tensor_c),), + ((tma_atom_c),), + ((tensormap_c_ptr),), + self.epilog_warp_id[0], + (tensormap_c_smem_ptr,), + ) + + mma_tile_coord_mnl = ( + grouped_gemm_cta_tile_info.cta_tile_idx_m + // cute.size(tiled_mma.thr_id.shape), + grouped_gemm_cta_tile_info.cta_tile_idx_n, + 0, + ) + cur_k_block_cnt = grouped_gemm_cta_tile_info.cta_tile_count_k + total_k_block_cnt += cur_k_block_cnt + + # + # Slice to per mma tile index + # + # ((ATOM_V, REST_V), EPI_M, EPI_N) + bSG_gC = bSG_gC_partitioned[ + ( + None, + None, + None, + *mma_tile_coord_mnl, + ) + ] + + # Set tensor memory buffer for current tile + acc_buf_idx = tile_sched.num_tiles_executed % self.num_acc_stage + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M) + tTR_tAcc = tTR_tAcc_base[(None, None, None, None, None, acc_buf_idx)] + + # + # Wait for accumulator buffer full + # + acc_full_phase = tile_sched.num_tiles_executed // self.num_acc_stage % 2 + cute.arch.mbarrier_wait(acc_full_mbar_ptr + acc_buf_idx, acc_full_phase) + + tTR_tAcc = cute.group_modes(tTR_tAcc, 3, cute.rank(tTR_tAcc)) + bSG_gC = cute.group_modes(bSG_gC, 1, cute.rank(bSG_gC)) + # ensure the update to tensormap has completed before using it + if is_group_changed: + if warp_idx == self.epilog_warp_id[0]: + tensormap_manager.fence_tensormap_update(tensormap_c_ptr) + # + # Store accumulator to global memory in subtiles + # + subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3]) + num_prev_subtiles = tile_sched.num_tiles_executed * subtile_cnt + for subtile_idx in cutlass.range_dynamic(subtile_cnt): + # + # Load accumulator from tensor memory buffer to register + # + tTR_tAcc_mn = tTR_tAcc[(None, None, None, subtile_idx)] + cute.copy(tiled_copy_t2r, tTR_tAcc_mn, tTR_rAcc) + + # + # Convert to output type + # + acc_vec = tiled_copy_r2s.retile(tTR_rAcc).load() + tRS_rC.store(acc_vec.to(self.c_dtype)) + # + # Store C to shared memory + # + epi_buffer = (num_prev_subtiles + subtile_idx) % self.num_epi_stage + cute.copy( + tiled_copy_r2s, + tRS_rC, + tRS_sC[(None, None, None, epi_buffer)], + ) + # Fence and barrier to make sure shared memory store is visible to TMA store + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + epilog_threads = 32 * 
len(self.epilog_warp_id) + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, + number_of_threads=epilog_threads, + ) + # + # store C to global memory with TMA + # + if warp_idx == self.epilog_warp_id[0]: + cute.copy( + tma_atom_c, + bSG_sC[(None, epi_buffer)], + bSG_gC[(None, subtile_idx)], + tma_desc_ptr=tensormap_manager.get_tensormap_ptr( + tensormap_c_ptr, + cute.AddressSpace.generic, + ), + ) + cute.arch.cp_async_bulk_commit_group() + cute.arch.cp_async_bulk_wait_group( + self.num_epi_stage - 1, read=True + ) + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, + number_of_threads=epilog_threads, + ) + # + # Async arrive accumulator buffer empty + # + with cute.arch.elect_one(): + cute.arch.mbarrier_arrive( + acc_empty_mbar_ptr + acc_buf_idx, + cta_rank_in_cluster // 2 * 2 if use_2cta_instrs else None, + ) + + # + # Advance to next tile + # + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + last_group_idx = cur_group_idx + + # + # Dealloc the tensor memory buffer + # + if warp_idx == self.epilog_warp_id[0]: + cute.arch.relinquish_tmem_alloc_permit(is_two_cta=use_2cta_instrs) + epilog_threads = 32 * len(self.epilog_warp_id) + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, number_of_threads=epilog_threads + ) + if warp_idx == self.epilog_warp_id[0]: + if use_2cta_instrs: + cute.arch.mbarrier_arrive( + tmem_dealloc_mbar_ptr, cta_rank_in_cluster ^ 1 + ) + cute.arch.mbarrier_wait(tmem_dealloc_mbar_ptr, 0) + cute.arch.dealloc_tmem( + tmem_ptr, self.num_tmem_alloc_cols, is_two_cta=use_2cta_instrs + ) + + # + # Wait a/b buffer empty + # + if warp_idx == self.epilog_warp_id[0]: + cute.arch.mbarrier_wait( + (ab_empty_mbar_ptr + ((total_k_block_cnt - 1) % self.num_ab_stage)), + (((total_k_block_cnt - 1) // self.num_ab_stage) % 2), + ) + + @cute.jit + def make_tensor_for_tensormap_update_old( + self, + group_idx: cutlass.Int32, + dtype: Type[cutlass.Numeric], + problem_shape_mnk: tuple[cutlass.Int32, cutlass.Int32, cutlass.Int32], + strides_abc: cute.Tensor, + tensor_address_abc: cute.Tensor, + tensor_index: int, + ): + """Extract stride and tensor address for a given group and construct a global tensor.""" + ptr_i64 = tensor_address_abc[(group_idx, tensor_index)] + if cutlass.const_expr( + not isclass(dtype) or not issubclass(dtype, cutlass.Numeric) + ): + raise TypeError( + f"dtype must be a type of cutlass.Numeric, got {type(dtype)}" + ) + tensor_gmem_ptr = cute.make_ptr( + dtype, ptr_i64, cute.AddressSpace.gmem, assumed_align=16 + ) + + strides_tensor_gmem = strides_abc[(group_idx, tensor_index, None)] + strides_tensor_reg = cute.make_fragment( + cute.make_layout(2), + strides_abc.element_type, + ) + cute.autovec_copy(strides_tensor_gmem, strides_tensor_reg) + + # Extract the actual stride values directly from the register + stride_m = strides_tensor_reg[0] # First stride value + stride_n = strides_tensor_reg[1] # Second stride value + c1 = cutlass.Int32(1) + c0 = cutlass.Int32(0) + + if cutlass.const_expr(tensor_index == 0): # tensor A (M, K, 1) + m = problem_shape_mnk[0] + k = problem_shape_mnk[2] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((m, k, c1), stride=(stride_m, stride_n, c0)), + ) + elif cutlass.const_expr(tensor_index == 1): # tensor B (N, K, 1) + n = problem_shape_mnk[1] + k = problem_shape_mnk[2] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((n, k, c1), stride=(stride_m, stride_n, c0)), + ) + else: # tensor C (M, N, 1) + m = problem_shape_mnk[0] + n = problem_shape_mnk[1] + return 
cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((m, n, c1), stride=(stride_m, stride_n, c0)), + ) + + @cute.jit + def make_tensor_for_tensormap_update( + self, + group_idx: cutlass.Int32, + dtype: Type[cutlass.Numeric], + problem_shape_mnk: tuple[cutlass.Int32, cutlass.Int32, cutlass.Int32], + strides_abc: cute.Tensor, + tensor_address_abc: cute.Tensor, + tensor_index: int, + ): + """ + Fixed version: Extract stride and tensor address for a given group and construct a global tensor. + + """ + ptr_i64 = tensor_address_abc[(group_idx, tensor_index)] + if cutlass.const_expr( + not isclass(dtype) or not issubclass(dtype, cutlass.Numeric) + ): + raise TypeError( + f"dtype must be a type of cutlass.Numeric, got {type(dtype)}" + ) + tensor_gmem_ptr = cute.make_ptr( + dtype, ptr_i64, cute.AddressSpace.gmem, assumed_align=16 + ) + + strides_tensor_gmem = strides_abc[(group_idx, tensor_index, None)] + strides_tensor_reg = cute.make_fragment( + cute.make_layout(2), + strides_abc.element_type, + ) + cute.autovec_copy(strides_tensor_gmem, strides_tensor_reg) + + # Extract the actual stride values + stride_0 = strides_tensor_reg[0] # Stride for dimension 0 + stride_1 = strides_tensor_reg[1] # Stride for dimension 1 + c1 = cutlass.Int32(1) + c0 = cutlass.Int32(0) + + if cutlass.const_expr(tensor_index == 0): # tensor A + # A has shape (M, K, 1) in MNKL format + # strides are (stride_M, stride_K) + m = problem_shape_mnk[0] + k = problem_shape_mnk[2] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((m, k, c1), stride=(stride_0, stride_1, c0)), + ) + elif cutlass.const_expr(tensor_index == 1): # tensor B + # B has shape (N, K, 1) in MNKL format (note: transposed from original K,N) + # strides are (stride_N, stride_K) + n = problem_shape_mnk[1] + k = problem_shape_mnk[2] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((n, k, c1), stride=(stride_0, stride_1, c0)), + ) + else: # tensor C + # C has shape (M, N, 1) in MNKL format + # strides are (stride_M, stride_N) + m = problem_shape_mnk[0] + n = problem_shape_mnk[1] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((m, n, c1), stride=(stride_0, stride_1, c0)), + ) + + @cute.jit + def make_tensor_for_tensormap_update_old( + self, + group_idx: cutlass.Int32, + dtype: Type[cutlass.Numeric], + problem_shape_mnk: tuple[cutlass.Int32, cutlass.Int32, cutlass.Int32], + strides_abc: cute.Tensor, + tensor_address_abc: cute.Tensor, + tensor_index: int, + ): + """Extract stride and tensor address for a given group and construct a global tensor. + + This function is used within the kernel to dynamically create a CUTE tensor + representing A, B, or C for the current group being processed, using the + group-specific address, shape, and stride information. + + :param group_idx: The index of the current group within the grouped GEMM. + :type group_idx: cutlass.Int32 + :param dtype: The data type of the tensor elements (e.g., cutlass.Float16). + :type dtype: Type[cutlass.Numeric] + :param problem_shape_mnk: The (M, N, K) problem shape for the current group. + :type problem_shape_mnk: tuple[cutlass.Int32, cutlass.Int32, cutlass.Int32] + :param strides_abc: Tensor containing strides for A, B, C for all groups. Layout: (group_count, 3, 2). + :type strides_abc: cute.Tensor + :param tensor_address_abc: Tensor containing global memory addresses for A, B, C for all groups. Layout: (group_count, 3). + :type tensor_address_abc: cute.Tensor + :param tensor_index: Specifies which tensor to create: 0 for A, 1 for B, 2 for C. 
+ :type tensor_index: int + :return: A CUTE tensor representing the requested global memory tensor (A, B, or C) for the specified group. + :rtype: cute.Tensor + :raises TypeError: If the provided dtype is not a subclass of cutlass.Numeric. + """ + ptr_i64 = tensor_address_abc[(group_idx, tensor_index)] + if cutlass.const_expr( + not isclass(dtype) or not issubclass(dtype, cutlass.Numeric) + ): + raise TypeError( + f"dtype must be a type of cutlass.Numeric, got {type(dtype)}" + ) + tensor_gmem_ptr = cute.make_ptr( + dtype, ptr_i64, cute.AddressSpace.gmem, assumed_align=16 + ) + + strides_tensor_gmem = strides_abc[(group_idx, tensor_index, None)] + strides_tensor_reg = cute.make_fragment( + cute.make_layout(2), + strides_abc.element_type, + ) + cute.autovec_copy(strides_tensor_gmem, strides_tensor_reg) + stride_mn = strides_tensor_reg[0] + stride_k = strides_tensor_reg[1] + c1 = cutlass.Int32(1) + c0 = cutlass.Int32(0) + + if cutlass.const_expr(tensor_index == 0): # tensor A + m = problem_shape_mnk[0] + k = problem_shape_mnk[2] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((m, k, c1), stride=(stride_mn, stride_k, c0)), + ) + elif cutlass.const_expr(tensor_index == 1): # tensor B + n = problem_shape_mnk[1] + k = problem_shape_mnk[2] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((n, k, c1), stride=(stride_mn, stride_k, c0)), + ) + else: # tensor C + m = problem_shape_mnk[0] + n = problem_shape_mnk[1] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((m, n, c1), stride=(stride_mn, stride_k, c0)), + ) + + def epilog_tmem_copy_and_partition( + self, + tidx: cutlass.Int32, + tAcc: cute.Tensor, + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + use_2cta_instrs: Union[cutlass.Boolean, bool], + ) -> tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ + Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination). 
+ + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param tAcc: The accumulator tensor to be copied and partitioned + :type tAcc: cute.Tensor + :param gC_mnl: The global tensor C + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler + :type epi_tile: cute.Tile + :param use_2cta_instrs: Whether use_2cta_instrs is enabled + :type use_2cta_instrs: bool + + :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where: + - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + - tTR_tAcc: The partitioned accumulator tensor + - tTR_rAcc: The accumulated tensor in register used to hold t2r results + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + # Make tiledCopy for tensor memory load(t2r) + copy_atom_t2r = sm100_utils.get_tmem_load_op( + self.cta_tile_shape_mnk, + self.c_layout, + self.c_dtype, + self.acc_dtype, + epi_tile, + use_2cta_instrs, + ) + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, STAGE) + tAcc_epi = cute.flat_divide( + tAcc[((None, None), 0, 0, None)], + epi_tile, + ) + # (EPI_TILE_M, EPI_TILE_N) + tiled_copy_t2r = tcgen05.make_tmem_copy( + copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)] + ) + + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M, STAGE) + tTR_tAcc = thr_copy_t2r.partition_S(tAcc_epi) + + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_mnl_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, loopM, loopN, loopL) + tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi) + # (T2R, T2R_M, T2R_N) + tTR_rAcc = cute.make_fragment( + tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.acc_dtype + ) + return tiled_copy_t2r, tTR_tAcc, tTR_rAcc + + def epilog_smem_copy_and_partition( + self, + tiled_copy_t2r: cute.TiledCopy, + tTR_rC: cute.Tensor, + tidx: cutlass.Int32, + sC: cute.Tensor, + ) -> tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ + Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination). 
+ + :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + :type tiled_copy_t2r: cute.TiledCopy + :param tTR_rC: The partitioned accumulator tensor + :type tTR_rC: cute.Tensor + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param sC: The shared memory tensor to be copied and partitioned + :type sC: cute.Tensor + + :return: A tuple containing (tiled_copy_r2s, tRS_rC, tRS_sC) where: + - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s) + - tRS_rC: The partitioned tensor C (register source) + - tRS_sC: The partitioned tensor C (smem destination) + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + copy_atom_r2s = sm100_utils.get_smem_store_op( + self.c_layout, self.c_dtype, self.acc_dtype, tiled_copy_t2r + ) + tiled_copy_r2s = cute.make_tiled_copy( + copy_atom_r2s, + layout_tv=tiled_copy_t2r.layout_dst_tv_tiled, + tiler_mn=tiled_copy_t2r.tiler_mn, + ) + # (R2S, R2S_M, R2S_N, PIPE_D) + thr_copy_r2s = tiled_copy_r2s.get_slice(tidx) + tRS_sC = thr_copy_r2s.partition_D(sC) + # (R2S, R2S_M, R2S_N) + tRS_rC = tiled_copy_r2s.retile(tTR_rC) + return tiled_copy_r2s, tRS_rC, tRS_sC + + def epilog_gmem_copy_and_partition( + self, + tma_atom_c: cute.CopyAtom, + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + sC: cute.Tensor, + ) -> tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]: + """Make tiledCopy for global memory store, then use it to partition + shared memory (source) and global memory (destination) for TMA store version. + + :param tma_atom_c: The TMA copy atom configured for storing tensor C. + :type tma_atom_c: cute.CopyAtom + :param gC_mnl: The global memory tensor C. + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler defining the granularity of the operation. + :type epi_tile: cute.Tile + :param sC: The shared memory epilogue buffer tensor. + :type sC: cute.Tensor + :return: A tuple containing: + - tma_atom_c: The input TMA copy atom (passed through). + - bSG_sC: The source shared memory tensor partitioned for the TMA operation. + - tCgC: The destination global memory tensor partitioned for the TMA operation. + :rtype: tuple[cute.CopyAtom, cute.Tensor, cute.Tensor] + """ + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + sC_for_tma_partition = cute.group_modes(sC, 0, 2) + gC_for_tma_partition = cute.group_modes(gC_epi, 0, 2) + # ((ATOM_V, REST_V), EPI_M, EPI_N) + # ((ATOM_V, REST_V), EPI_M, EPI_N, loopM, loopN, loopL) + bSG_sC, bSG_gC = cpasync.tma_partition( + tma_atom_c, + 0, + cute.make_layout(1), + sC_for_tma_partition, + gC_for_tma_partition, + ) + return tma_atom_c, bSG_sC, bSG_gC + + @staticmethod + def _compute_stages( + tiled_mma: cute.TiledMma, + mma_tiler_mnk: tuple[int, int, int], + a_dtype: type[cutlass.Numeric], + b_dtype: type[cutlass.Numeric], + epi_tile: cute.Tile, + c_dtype: type[cutlass.Numeric], + c_layout: utils.LayoutEnum, + num_smem_capacity: int, + occupancy: int, + ) -> tuple[int, int, int]: + """Computes the number of stages for accumulator, A/B operands, and epilogue based on heuristics. + + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler. + :type mma_tiler_mnk: tuple[int, int, int] + :param a_dtype: Data type of operand A. + :type a_dtype: type[cutlass.Numeric] + :param b_dtype: Data type of operand B. 
+ :type b_dtype: type[cutlass.Numeric] + :param epi_tile: The epilogue tile shape. + :type epi_tile: cute.Tile + :param c_dtype: Data type of operand C (output). + :type c_dtype: type[cutlass.Numeric] + :param c_layout: Layout enum of operand C in global memory. + :type c_layout: utils.LayoutEnum + :param num_smem_capacity: Total available shared memory capacity in bytes. + :type num_smem_capacity: int + :param occupancy: Target number of CTAs per SM (occupancy). + :type occupancy: int + + :return: A tuple containing the computed number of stages for: + (accumulator stages, A/B operand stages, epilogue stages) + :rtype: tuple[int, int, int] + """ + # Default accumulator and epilogue stages + num_acc_stage = 2 + num_epi_stage = 2 + + # Calculate smem layout and size for one stage of A, B, and Epilogue + a_smem_layout_stage_one = sm100_utils.make_smem_layout_a( + tiled_mma, + mma_tiler_mnk, + a_dtype, + 1, # stage=1 + ) + b_smem_layout_staged_one = sm100_utils.make_smem_layout_b( + tiled_mma, + mma_tiler_mnk, + b_dtype, + 1, # stage=1 + ) + epi_smem_layout_staged_one = sm100_utils.make_smem_layout_epi( + c_dtype, + c_layout, + epi_tile, + 1, # stage=1 + ) + ab_bytes_per_stage = cute.size_in_bytes( + a_dtype, a_smem_layout_stage_one + ) + cute.size_in_bytes(b_dtype, b_smem_layout_staged_one) + + epi_bytes_per_stage = cute.size_in_bytes(c_dtype, epi_smem_layout_staged_one) + epi_bytes = epi_bytes_per_stage * num_epi_stage + + # Calculate A/B stages: + # Start with total smem per CTA (capacity / occupancy) + # Subtract reserved bytes and initial epilogue bytes + # Divide remaining by bytes needed per A/B stage + num_ab_stage = ( + num_smem_capacity // occupancy + - GroupedGemmKernel.reserved_smem_bytes + - epi_bytes + ) // ab_bytes_per_stage + + # Refine epilogue stages: + # Calculate remaining smem after allocating for A/B stages and reserved bytes + # Add remaining unused smem to epilogue + remaining_smem = ( + num_smem_capacity + - occupancy * ab_bytes_per_stage * num_ab_stage + - occupancy * (GroupedGemmKernel.reserved_smem_bytes + epi_bytes) + ) + num_epi_stage += remaining_smem // (occupancy * epi_bytes_per_stage) + return num_acc_stage, num_ab_stage, num_epi_stage + + @staticmethod + def _compute_grid( + total_num_clusters: int, + cluster_shape_mn: tuple[int, int], + max_active_clusters: cutlass.Constexpr[int], + ) -> tuple[utils.PersistentTileSchedulerParams, tuple[int, int, int]]: + """Compute tile scheduler parameters and grid shape for grouped GEMM operations. + + :param total_num_clusters: Total number of clusters to process across all groups. + :type total_num_clusters: int + :param cluster_shape_mn: Shape of each cluster in M, N dimensions. + :type cluster_shape_mn: tuple[int, int] + :param max_active_clusters: Maximum number of active clusters. + :type max_active_clusters: cutlass.Constexpr[int] + + :return: A tuple containing: + - tile_sched_params: Parameters for the persistent tile scheduler. + - grid: Grid shape for kernel launch. + :rtype: tuple[utils.PersistentTileSchedulerParams, tuple[int, ...]] + """ + # Create problem shape with M, N dimensions from cluster shape + # and L dimension representing the total number of clusters. 
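+        # Illustrative example (hypothetical numbers, not taken from this file):
+        # with cluster_shape_mn = (2, 1) and total_num_clusters = 100, the tile
+        # scheduler sees problem_shape_ntile_mnl = (2, 1, 100), i.e. one L slice
+        # per cluster, each covering a 2x1 block of CTA tiles.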
+ problem_shape_ntile_mnl = ( + cluster_shape_mn[0], + cluster_shape_mn[1], + cutlass.Int32(total_num_clusters), + ) + + tile_sched_params = utils.PersistentTileSchedulerParams( + problem_shape_ntile_mnl, (*cluster_shape_mn, 1) + ) + + grid = utils.StaticPersistentTileScheduler.get_grid_shape( + tile_sched_params, max_active_clusters + ) + + return tile_sched_params, grid + + @staticmethod + def _get_mbar_smem_bytes(**kwargs_stages: int) -> int: + """Calculate shared memory consumption for memory barriers based on provided stages. + + Each stage requires 2 barriers, and each barrier consumes 8 bytes of shared memory. + The total consumption is the sum across all provided stages. This function calculates the total + shared memory needed for these barriers. + + :param kwargs_stages: Variable keyword arguments where each key is a stage name + (e.g., num_acc_stage, num_ab_stage) and each value is the + number of stages of that type. + :type kwargs_stages: int + :return: Total shared memory bytes required for all memory barriers. + :rtype: int + """ + num_barriers_per_stage = 2 + num_bytes_per_barrier = 8 + mbar_smem_consumption = sum( + [ + num_barriers_per_stage * num_bytes_per_barrier * stage + for stage in kwargs_stages.values() + ] + ) + return mbar_smem_consumption + + @staticmethod + def _get_tensormap_smem_bytes( + tensormap_update_mode: utils.TensorMapUpdateMode, + ) -> int: + """Get the SMEM consumption for the tensormap buffer based on the update mode. + + :param tensormap_update_mode: Specifies whether tensormaps are updated in GMEM or SMEM. + :type tensormap_update_mode: utils.TensorMapUpdateMode + :return: The shared memory bytes required for the tensormap buffer. Returns 0 if mode is GMEM. + :rtype: int + :raises ValueError: If an invalid tensormap update mode is provided. 
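+
+ A minimal sketch of the SMEM-mode arithmetic, using the class constants defined
+ below (``bytes_per_tensormap = 128``, ``num_tensormaps = 3``); the numbers are
+ descriptive only:
+
+ .. code-block:: python
+
+ # one 128-byte TMA descriptor is buffered per tensor (A, B, C)
+ smem_bytes = (
+ GroupedGemmKernel.bytes_per_tensormap * GroupedGemmKernel.num_tensormaps
+ )
+ assert smem_bytes == 384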
+ """ + if tensormap_update_mode == utils.TensorMapUpdateMode.GMEM: + return 0 + elif tensormap_update_mode == utils.TensorMapUpdateMode.SMEM: + return ( + GroupedGemmKernel.bytes_per_tensormap * GroupedGemmKernel.num_tensormaps + ) + else: + raise ValueError(f"Invalid tensormap update mode: {tensormap_update_mode}") + + @staticmethod + def _get_tensor_smem_bytes( + a_smem_layout_staged: cute.Layout, + a_dtype: Type[cutlass.Numeric], + b_smem_layout_staged: cute.Layout, + b_dtype: Type[cutlass.Numeric], + epi_smem_layout_staged: cute.Layout, + c_dtype: Type[cutlass.Numeric], + ) -> int: + """Compute the total SMEM consumption for tensor A, B and C.""" + ab_bytes = cute.size_in_bytes( + a_dtype, a_smem_layout_staged + ) + cute.size_in_bytes(b_dtype, b_smem_layout_staged) + + epi_bytes = cute.size_in_bytes(c_dtype, epi_smem_layout_staged) + return ab_bytes + epi_bytes + + @staticmethod + def _get_tma_atom_kind(atom_sm_cnt: int, mcast: bool): + """Select the appropriate TMA copy atom based on the number of SMs and the multicast flag.""" + if atom_sm_cnt == 2 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 2 and not mcast: + return cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 1 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.ONE) + elif atom_sm_cnt == 1 and not mcast: + return cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE) + + raise ValueError(f"Invalid atom_sm_cnt: {atom_sm_cnt} and {mcast}") + + @staticmethod + def _compute_num_tmem_alloc_cols( + tiled_mma: cute.TiledMma, + mma_tiler: tuple[int, int, int], + num_acc_stage: int, + ) -> int: + """ + Compute the number of tensor memory allocation columns. + + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler: The shape (M, N, K) of the MMA tile. + :type mma_tiler: tuple[int, int, int] + :param acc_stage: The stage of the accumulator tensor. + :type acc_stage: int + + :return: The number of tensor memory allocation columns. 
+ :rtype: int + """ + acc_shape = tiled_mma.partition_shape_C(mma_tiler[:2]) + tCtAcc_fake = tiled_mma.make_fragment_C(cute.append(acc_shape, num_acc_stage)) + num_tmem_alloc_cols = utils.get_num_tmem_alloc_cols(tCtAcc_fake) + + return num_tmem_alloc_cols + + # Size of smem we reserved for mbarrier, tensor memory management and tensormap update + reserved_smem_bytes = 1024 + bytes_per_tensormap = 128 + num_tensormaps = 3 + # size of smem used for tensor memory management + tensor_memory_management_bytes = 12 + + +def run_grouped_gemm( + num_groups: int, + problem_sizes_mnkl: tuple[int, int, int, int], + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + a_major: str, + b_major: str, + c_major: str, + mma_tiler_mn: tuple[int, int], + cluster_shape_mn: tuple[int, int], + use_2cta_instrs: bool, + tensormap_update_mode: utils.TensorMapUpdateMode, + tolerance: float, + warmup_iterations: int, + iterations: int, + skip_ref_check: bool, +): + """Run grouped GEMM example with specified configurations.""" + print(f"Running Blackwell Grouped GEMM test with:") + print(f"{num_groups} groups") + for i, (m, n, k, l) in enumerate(problem_sizes_mnkl): + print(f"Group {i}: {m}x{n}x{k}x{l}") + print(f"AB dtype: {ab_dtype}, C dtype: {c_dtype}, Acc dtype: {acc_dtype}") + print(f"Matrix majors - A: {a_major}, B: {b_major}, C: {c_major}") + print(f"Mma Tiler (M, N): {mma_tiler_mn}, Cluster Shape (M, N): {cluster_shape_mn}") + print(f"2CTA MMA instructions: {'True' if use_2cta_instrs else 'False'}") + print(f"Tensor map update mode: {tensormap_update_mode}") + print(f"Tolerance: {tolerance}") + print(f"Warmup iterations: {warmup_iterations}") + print(f"Iterations: {iterations}") + print(f"Skip reference checking: {skip_ref_check}") + + # Skip unsupported types + if ab_dtype not in { + cutlass.Float16, + cutlass.BFloat16, + }: + raise ValueError(f"Skip unsupported ab_dtype {ab_dtype}") + if c_dtype not in {cutlass.Float16, cutlass.BFloat16, cutlass.Float32}: + raise ValueError(f"Skip unsupported c_dtype {c_dtype}") + # Skip unsupported acc dtype + if acc_dtype not in {cutlass.Float32, cutlass.Float16}: + raise ValueError(f"Skip unsupported acc_dtype {acc_dtype}") + # Skip invalid ab_dtype and acc_dtype combination + if ab_dtype == cutlass.BFloat16 and acc_dtype == cutlass.Float16: + raise ValueError("Skip invalid ab_dtype and acc_dtype combination") + # Skip invalid mma tile shape + if not ( + (not use_2cta_instrs and mma_tiler_mn[0] in [64, 128]) + or (use_2cta_instrs and mma_tiler_mn[0] in [128, 256]) + ): + raise ValueError(f"Skip invalid mma tiler M {mma_tiler_mn[0]}") + if mma_tiler_mn[1] not in range(32, 257, 32): + raise ValueError(f"Skip invalid mma tiler N {mma_tiler_mn[1]}") + # Skip illegal cluster shape + if cluster_shape_mn[0] % (2 if use_2cta_instrs else 1) != 0: + raise ValueError( + f"cluster_shape_m need align with use_2cta_instrs config {cluster_shape_mn}" + ) + # Skip invalid cluster shape + is_power_of_2 = lambda x: x > 0 and (x & (x - 1)) == 0 + if ( + cluster_shape_mn[0] * cluster_shape_mn[1] > 16 + or cluster_shape_mn[0] <= 0 + or cluster_shape_mn[1] <= 0 + or not is_power_of_2(cluster_shape_mn[0]) + or not is_power_of_2(cluster_shape_mn[1]) + ): + raise ValueError(f"Skip invalid cluster shape {cluster_shape_mn}") + + # Skip illegal problem shape for load/store alignment + def check_contigous_16B_alignment(dtype, is_mode0_major, tensor_shape): + major_mode_idx = 0 if is_mode0_major else 1 + num_major_elements = 
tensor_shape[major_mode_idx] + num_contiguous_elements = 16 * 8 // dtype.width + return num_major_elements % num_contiguous_elements == 0 + + if ( + not check_contigous_16B_alignment(ab_dtype, a_major == "m", (m, k, l)) + or not check_contigous_16B_alignment(ab_dtype, b_major == "n", (n, k, l)) + or not check_contigous_16B_alignment(c_dtype, c_major == "m", (m, n, l)) + ): + raise ValueError("Skip invalid problem alignment") + if not torch.cuda.is_available(): + raise RuntimeError("GPU is required to run this example!") + + torch.manual_seed(2025) + + # Create tensor and return the pointer, tensor, and stride + def create_tensor_and_stride( + l: int, + mode0: int, + mode1: int, + is_mode0_major: bool, + dtype: type[cutlass.Numeric], + is_dynamic_layout: bool = True, + ) -> tuple[int, torch.Tensor, cute.Tensor, torch.Tensor, tuple[int, int]]: + # is_mode0_major: (l, mode1, mode0) -> (mode0, mode1, l) + # else: (l, mode0, mode1) -> (mode0, mode1, l) + shape = (l, mode1, mode0) if is_mode0_major else (l, mode0, mode1) + permute_order = (2, 1, 0) if is_mode0_major else (1, 2, 0) + # omit stride for L mode as it is always 1 for grouped GEMM + strides = (1, mode0) if is_mode0_major else (mode1, 1) + assert dtype in {cutlass.Float16, cutlass.BFloat16, cutlass.Float32} + is_unsigned = False + + torch_dtype = cutlass_torch.dtype(dtype) + torch_tensor_cpu = cutlass_torch.create_and_permute_torch_tensor( + shape, + torch_dtype, + permute_order=permute_order, + init_type=cutlass_torch.TensorInitType.RANDOM, + init_config=cutlass_torch.RandomInitConfig( + min_val=0 if is_unsigned else -2, max_val=4 if is_unsigned else 2 + ), + ) + torch_tensor = torch_tensor_cpu.cuda() + f32_torch_tensor = torch_tensor_cpu.to(dtype=torch.float32) + + cute_tensor = from_dlpack(torch_tensor, assumed_align=16) + if is_dynamic_layout: + cute_tensor = cute_tensor.mark_layout_dynamic( + leading_dim=(0 if is_mode0_major else 1) + ) + cute_tensor = cutlass_torch.convert_cute_tensor( + f32_torch_tensor, + cute_tensor, + dtype, + is_dynamic_layout=is_dynamic_layout, + ) + # Get pointer of the tensor + ptr = torch_tensor.data_ptr() + return ptr, torch_tensor, cute_tensor, f32_torch_tensor, strides + + # iterate all groups and create tensors for each group + torch_fp32_tensors_abc = [] + torch_tensors_abc = [] + cute_tensors_abc = [] + strides_abc = [] + ptrs_abc = [] + for _, (m, n, k, l) in enumerate(problem_sizes_mnkl): + ptr_a, torch_tensor_a, cute_tensor_a, tensor_fp32_a, stride_mk_a = ( + create_tensor_and_stride(l, m, k, a_major == "m", ab_dtype) + ) + ptr_b, torch_tensor_b, cute_tensor_b, tensor_fp32_b, stride_nk_b = ( + create_tensor_and_stride(l, n, k, b_major == "n", ab_dtype) + ) + ptr_c, torch_tensor_c, cute_tensor_c, tensor_fp32_c, stride_mn_c = ( + create_tensor_and_stride(l, m, n, c_major == "m", c_dtype) + ) + ptrs_abc.append([ptr_a, ptr_b, ptr_c]) + torch_tensors_abc.append([torch_tensor_a, torch_tensor_b, torch_tensor_c]) + torch_fp32_tensors_abc.append([tensor_fp32_a, tensor_fp32_b, tensor_fp32_c]) + strides_abc.append([stride_mk_a, stride_nk_b, stride_mn_c]) + cute_tensors_abc.append( + ( + cute_tensor_a, + cute_tensor_b, + cute_tensor_c, + ) + ) + # Choose A, B, C with the smallest size to create initial tensormaps + key_size_a = lambda item: item[1][0] * item[1][2] + key_size_b = lambda item: item[1][1] * item[1][2] + key_size_c = lambda item: item[1][0] * item[1][1] + # Find the indices of the groups with the smallest tensor sizes + min_a_idx, _ = min(enumerate(problem_sizes_mnkl), key=key_size_a) + 
min_b_idx, _ = min(enumerate(problem_sizes_mnkl), key=key_size_b) + min_c_idx, _ = min(enumerate(problem_sizes_mnkl), key=key_size_c) + initial_cute_tensors_abc = [ + cute_tensors_abc[min_a_idx][0], # A with smallest (m, k) + cute_tensors_abc[min_b_idx][1], # B with smallest (n, k) + cute_tensors_abc[min_c_idx][2], # C with smallest (m, n) + ] + + hardware_info = utils.HardwareInfo() + sm_count = hardware_info.get_max_active_clusters(1) + max_active_clusters = hardware_info.get_max_active_clusters( + cluster_shape_mn[0] * cluster_shape_mn[1] + ) + # Prepare tensormap buffer for each SM + num_tensormap_buffers = sm_count + tensormap_pytorch_tensor = ( + torch.empty( + ( + num_tensormap_buffers, + GroupedGemmKernel.num_tensormaps, + GroupedGemmKernel.bytes_per_tensormap // 8, + ), + dtype=torch.int64, + ) + .fill_(0) + .cuda() + ) + tensormap_cute_tensor = from_dlpack(tensormap_pytorch_tensor, assumed_align=16) + + grouped_gemm = GroupedGemmKernel( + acc_dtype, + use_2cta_instrs, + mma_tiler_mn, + cluster_shape_mn, + tensormap_update_mode, + ) + + # Convert integer list to torch tensor and cute tensor + def convert_list_to_tensor(l, dtype) -> tuple[torch.Tensor, cute.Tensor]: + torch_tensor = torch.tensor(l, dtype=dtype).cuda() + cute_tensor = from_dlpack(torch_tensor, assumed_align=16) + return torch_tensor, cute_tensor + + # layout (num_groups, 4):(4, 1) + problem_sizes_mnkl_torch_tensor, problem_sizes_mnkl_cute_tensor = ( + convert_list_to_tensor(problem_sizes_mnkl, torch.int32) + ) + # layout (num_groups, 3, 2):(6, 2, 1) + strides_abc_torch_tensor, strides_abc_cute_tensor = convert_list_to_tensor( + strides_abc, torch.int32 + ) + # layout (num_groups,3):(3, 1) + ptrs_abc_torch_tensor, ptrs_abc_cute_tensor = convert_list_to_tensor( + ptrs_abc, torch.int64 + ) + + # Compute total number of cluster tiles we need to compute for given grouped GEMM problem + def compute_total_num_clusters( + problem_sizes_mnkl: List[tuple[int, int, int, int]], + cluster_tile_shape_mn: tuple[int, int], + ) -> int: + total_num_clusters = 0 + for m, n, _, _ in problem_sizes_mnkl: + num_clusters_mn = tuple( + (x + y - 1) // y for x, y in zip((m, n), cluster_tile_shape_mn) + ) + total_num_clusters += functools.reduce(lambda x, y: x * y, num_clusters_mn) + return total_num_clusters + + # Compute cluster tile shape + def compute_cluster_tile_shape( + mma_tiler_mn: tuple[int, int], + cluster_shape_mn: tuple[int, int], + use_2cta_instrs: bool, + ) -> tuple[int, int]: + cta_tile_shape_mn = list(mma_tiler_mn) + if use_2cta_instrs: + cta_tile_shape_mn[0] = cta_tile_shape_mn[0] // 2 + return tuple(x * y for x, y in zip(cta_tile_shape_mn, cluster_shape_mn)) + + cluster_tile_shape_mn = compute_cluster_tile_shape( + mma_tiler_mn, cluster_shape_mn, use_2cta_instrs + ) + total_num_clusters = compute_total_num_clusters( + problem_sizes_mnkl, cluster_tile_shape_mn + ) + + # Get current CUDA stream from PyTorch + torch_stream = torch.cuda.current_stream() + # Get the raw stream pointer as a CUstream + current_stream = cuda.CUstream(torch_stream.cuda_stream) + # Compile grouped GEMM kernel + compiled_grouped_gemm = cute.compile( + grouped_gemm, + initial_cute_tensors_abc[0], + initial_cute_tensors_abc[1], + initial_cute_tensors_abc[2], + num_groups, + problem_sizes_mnkl_cute_tensor, + strides_abc_cute_tensor, + ptrs_abc_cute_tensor, + total_num_clusters, + tensormap_cute_tensor, + max_active_clusters, + current_stream, + ) + + # Launch GPU kernel + # Warm up + for _ in range(warmup_iterations): + compiled_grouped_gemm( + 
initial_cute_tensors_abc[0], + initial_cute_tensors_abc[1], + initial_cute_tensors_abc[2], + problem_sizes_mnkl_cute_tensor, + strides_abc_cute_tensor, + ptrs_abc_cute_tensor, + tensormap_cute_tensor, + current_stream, + ) + # Execution + for i in range(iterations): + compiled_grouped_gemm( + initial_cute_tensors_abc[0], + initial_cute_tensors_abc[1], + initial_cute_tensors_abc[2], + problem_sizes_mnkl_cute_tensor, + strides_abc_cute_tensor, + ptrs_abc_cute_tensor, + tensormap_cute_tensor, + current_stream, + ) + + # Compute reference result + if not skip_ref_check: + refs = [] + for a, b, _ in torch_fp32_tensors_abc: + ref = (torch.einsum("mkl,nkl->mnl", a, b)).cpu() + refs.append(ref) + for i, ((_, _, c), ref) in enumerate(zip(torch_tensors_abc, refs)): + print(f"checking group {i}") + if c_dtype == cutlass.Float32: + ref_c = ref + else: + ref_c = ref.to(cutlass_torch.dtype(c_dtype)) + torch.testing.assert_close( + c.cpu(), + ref_c, + atol=tolerance, + rtol=1e-05, + ) + + +if __name__ == "__main__": + + def parse_comma_separated_ints(s: str) -> tuple[int, ...]: + try: + return tuple(int(x.strip()) for x in s.split(",")) + except ValueError: + raise argparse.ArgumentTypeError( + "Invalid format. Expected comma-separated integers." + ) + + def parse_comma_separated_tuples(s: str) -> List[tuple[int, ...]]: + if s.strip().startswith("("): + # Split on ),( to separate tuples + tuples = s.strip("()").split("),(") + result = [] + tuple_len = None + + for t in tuples: + # Parse individual tuple + nums = [int(x.strip()) for x in t.split(",")] + + # Validate tuple length consistency + if tuple_len is None: + tuple_len = len(nums) + elif len(nums) != tuple_len: + raise argparse.ArgumentTypeError( + "All tuples must have the same length" + ) + + result.append(tuple(nums)) + return result + + raise argparse.ArgumentTypeError( + "Invalid format. Expected comma-separated integers or list of tuples" + ) + + parser = argparse.ArgumentParser( + description="Example of Grouped GEMM on Blackwell." 
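+ # Illustrative only: a value such as
+ # --problem_sizes_mnkl "(256,512,128,1),(640,128,256,1)"
+ # is parsed by parse_comma_separated_tuples above into two 4-tuples; the list
+ # length must equal --num_groups and every l must be 1 (both checked below).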
+ ) + parser.add_argument( + "--num_groups", + type=int, + default=2, + help="Number of groups", + ) + parser.add_argument( + "--problem_sizes_mnkl", + type=parse_comma_separated_tuples, + default=((128, 128, 128, 1), (128, 128, 128, 1)), + help="a tuple of problem sizes for each group (comma-separated tuples)", + ) + parser.add_argument( + "--mma_tiler_mn", + type=parse_comma_separated_ints, + default=(128, 128), + help="Mma tile shape (comma-separated)", + ) + parser.add_argument( + "--cluster_shape_mn", + type=parse_comma_separated_ints, + default=(1, 1), + help="Cluster shape (comma-separated)", + ) + parser.add_argument( + "--tensormap_update_mode", + type=str, + default="SMEM", + help="Tensor map update mode", + ) + parser.add_argument("--ab_dtype", type=cutlass.dtype, default=cutlass.Float16) + parser.add_argument("--c_dtype", type=cutlass.dtype, default=cutlass.Float16) + parser.add_argument("--acc_dtype", type=cutlass.dtype, default=cutlass.Float32) + parser.add_argument( + "--use_2cta_instrs", + action="store_true", + help="Enable 2CTA MMA instructions feature", + ) + parser.add_argument("--a_major", choices=["k", "m"], type=str, default="k") + parser.add_argument("--b_major", choices=["k", "n"], type=str, default="k") + parser.add_argument("--c_major", choices=["n", "m"], type=str, default="n") + parser.add_argument( + "--tolerance", type=float, default=1e-01, help="Tolerance for validation" + ) + parser.add_argument( + "--warmup_iterations", type=int, default=0, help="Warmup iterations" + ) + parser.add_argument( + "--iterations", + type=int, + default=1, + help="Number of iterations to run the kernel", + ) + parser.add_argument( + "--skip_ref_check", action="store_true", help="Skip reference checking" + ) + + args = parser.parse_args() + + if ( + len(args.problem_sizes_mnkl) != 0 + and len(args.problem_sizes_mnkl) != args.num_groups + ): + parser.error("--problem_sizes_mnkl must contain exactly num_groups tuples") + + # l mode must be 1 for all groups + for _, _, _, l in args.problem_sizes_mnkl: + if l != 1: + parser.error("l must be 1 for all groups") + + if len(args.mma_tiler_mn) != 2: + parser.error("--mma_tiler_mn must contain exactly 2 values") + + if len(args.cluster_shape_mn) != 2: + parser.error("--cluster_shape_mn must contain exactly 2 values") + + if args.tensormap_update_mode not in ["GMEM", "SMEM"]: + parser.error("--tensormap_update_mode must be GMEM or SMEM") + + if args.tensormap_update_mode == "GMEM": + tensormap_update_mode = utils.TensorMapUpdateMode.GMEM + else: + tensormap_update_mode = utils.TensorMapUpdateMode.SMEM + + run_grouped_gemm( + args.num_groups, + args.problem_sizes_mnkl, + args.ab_dtype, + args.c_dtype, + args.acc_dtype, + args.a_major, + args.b_major, + args.c_major, + args.mma_tiler_mn, + args.cluster_shape_mn, + args.use_2cta_instrs, + tensormap_update_mode, + args.tolerance, + args.warmup_iterations, + args.iterations, + args.skip_ref_check, + ) + print("PASS") From 97bf42beb3c8827da567b0ab4ecf1f979c16762b Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sun, 8 Jun 2025 09:22:37 -0700 Subject: [PATCH 02/34] add benchmark showcasing pytorch integration --- .../blackwell/benchmark_cute_grouped_gemm.py | 537 ++++++++++++++++++ 1 file changed, 537 insertions(+) create mode 100644 torchtitan/experiments/kernels/blackwell/benchmark_cute_grouped_gemm.py diff --git a/torchtitan/experiments/kernels/blackwell/benchmark_cute_grouped_gemm.py b/torchtitan/experiments/kernels/blackwell/benchmark_cute_grouped_gemm.py new file mode 100644 index 
000000000..b6fdaab31 --- /dev/null +++ b/torchtitan/experiments/kernels/blackwell/benchmark_cute_grouped_gemm.py @@ -0,0 +1,537 @@ +#!/usr/bin/env python3 +""" +Grouped GEMM Benchmark using Triton's do_bench + +Compares CUTLASS GroupedGemmKernel against PyTorch manual looping +with robust timing measurements and various problem size configurations. +""" + +import time +from typing import Any, Dict, List, Tuple + +import torch +import triton.testing + +torch.backends.cuda.matmul.allow_tf32 = True + +# Import CUTLASS components +try: + import cuda.bindings.driver as cuda + import cutlass + import cutlass.cute as cute + import cutlass.torch as cutlass_torch + import cutlass.utils as utils + from cute_grouped_gemm import GroupedGemmKernel + from cutlass.cute.runtime import from_dlpack + + HAS_CUTLASS = True + print("✓ CUTLASS and GroupedGemmKernel imported successfully") +except ImportError as e: + HAS_CUTLASS = False + print(f"✗ CUTLASS import failed: {e}") + print("Make sure CUTLASS and cute_grouped_gemm.py are available") + exit(1) + + +class GroupedGemmBenchmark: + """Wrapper class for CUTLASS GroupedGemmKernel benchmarking.""" + + def __init__(self, problem_sizes: List[Tuple[int, int, int, int]]): + """ + Initialize grouped GEMM benchmark. + + Args: + problem_sizes: List of (M, N, K, L) tuples defining each GEMM problem + """ + self.problem_sizes = problem_sizes + self.num_groups = len(problem_sizes) + self.device = torch.device("cuda") + self.dtype_torch = torch.float16 + self.dtype_cutlass = cutlass.Float16 + + print(f"Setting up grouped GEMM with {self.num_groups} groups:") + for i, (M, N, K, L) in enumerate(problem_sizes): + print(f" Group {i}: C[{M},{N}] = A[{M},{K}] @ B[{K},{N}]") + + # Setup tensors and kernel + self._setup_tensors() + self._setup_kernel() + + def _create_tensor_with_strides(self, M, N, K): + """Create PyTorch tensors and extract their actual strides.""" + # Create standard PyTorch tensors (row-major by default) + A = torch.randn(M, K, dtype=self.dtype_torch, device=self.device) + B = torch.randn(K, N, dtype=self.dtype_torch, device=self.device) + C = torch.zeros(M, N, dtype=self.dtype_torch, device=self.device) + + # Convert to MNKL format + A_mnkl = A.unsqueeze(-1).contiguous() # (M, K) -> (M, K, 1) + B_mnkl = B.transpose(0, 1).unsqueeze(-1).contiguous() # (K, N) -> (N, K, 1) + C_mnkl = C.unsqueeze(-1).contiguous() # (M, N) -> (M, N, 1) + + # Create CUTE tensors + A_cute = from_dlpack(A_mnkl, assumed_align=16) + B_cute = from_dlpack(B_mnkl, assumed_align=16) + C_cute = from_dlpack(C_mnkl, assumed_align=16) + + # Set CUTE properties + A_cute.element_type = self.dtype_cutlass + B_cute.element_type = self.dtype_cutlass + C_cute.element_type = self.dtype_cutlass + + # Mark layouts as dynamic + A_cute = A_cute.mark_layout_dynamic(leading_dim=1) + B_cute = B_cute.mark_layout_dynamic(leading_dim=1) + C_cute = C_cute.mark_layout_dynamic(leading_dim=1) + + # Extract 2D strides + A_strides = A_mnkl.stride()[:2] + B_strides = B_mnkl.stride()[:2] + C_strides = C_mnkl.stride()[:2] + + return ( + (A, B, C), + (A_cute, B_cute, C_cute), + (A_strides, B_strides, C_strides), + A_mnkl.data_ptr(), + B_mnkl.data_ptr(), + C_mnkl.data_ptr(), + ) + + def _setup_tensors(self): + """Setup all tensors and metadata for grouped GEMM.""" + self.torch_tensors = [] + self.cute_tensors = [] + strides = [] + pointers = [] + + # Create tensors for each group + for M, N, K, L in self.problem_sizes: + torch_abc, cute_abc, stride_abc, ptr_a, ptr_b, ptr_c = ( + self._create_tensor_with_strides(M, N, 
K) + ) + + self.torch_tensors.append(torch_abc) + self.cute_tensors.append(cute_abc) + strides.append(stride_abc) + pointers.append([ptr_a, ptr_b, ptr_c]) + + # Convert metadata to tensors + problem_sizes_tensor = torch.tensor( + self.problem_sizes, dtype=torch.int32, device=self.device + ) + self.problem_sizes_cute = from_dlpack(problem_sizes_tensor, assumed_align=16) + + strides_tensor = torch.tensor(strides, dtype=torch.int32, device=self.device) + self.strides_cute = from_dlpack(strides_tensor, assumed_align=16) + + pointers_tensor = torch.tensor(pointers, dtype=torch.int64, device=self.device) + self.pointers_cute = from_dlpack(pointers_tensor, assumed_align=16) + + # Create tensormap buffer + hardware_info = utils.HardwareInfo() + sm_count = hardware_info.get_max_active_clusters(1) + + tensormap_tensor = torch.zeros( + (sm_count, 3, 128 // 8), + dtype=torch.int64, + device=self.device, + ) + self.tensormap_cute = from_dlpack(tensormap_tensor, assumed_align=16) + + def _setup_kernel(self): + """Setup and compile the grouped GEMM kernel.""" + # Create grouped GEMM kernel + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=cutlass.Float32, + use_2cta_instrs=False, + mma_tiler_mn=(128, 64), + cluster_shape_mn=(1, 1), + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + # Compute grid parameters + def compute_total_clusters(): + cta_tile_m = 128 + cta_tile_n = 64 + cluster_m = 1 + cluster_n = 1 + + cluster_tile_m = cta_tile_m * cluster_m + cluster_tile_n = cta_tile_n * cluster_n + + total = 0 + for M, N, K, L in self.problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + return total + + self.total_clusters = compute_total_clusters() + + hardware_info = utils.HardwareInfo() + self.max_active_clusters = hardware_info.get_max_active_clusters(1) + + # Choose initial tensors (smallest ones for tensormap initialization) + sizes = [(M * K, N * K, M * N) for M, N, K, L in self.problem_sizes] + min_a_idx = min(range(self.num_groups), key=lambda i: sizes[i][0]) + min_b_idx = min(range(self.num_groups), key=lambda i: sizes[i][1]) + min_c_idx = min(range(self.num_groups), key=lambda i: sizes[i][2]) + + self.initial_A = self.cute_tensors[min_a_idx][0] + self.initial_B = self.cute_tensors[min_b_idx][1] + self.initial_C = self.cute_tensors[min_c_idx][2] + + # Setup stream + self.torch_stream = torch.cuda.Stream() + self.stream = cuda.CUstream(self.torch_stream.cuda_stream) + + # Compile kernel + self.compiled_kernel = cute.compile( + self.grouped_gemm, + self.initial_A, + self.initial_B, + self.initial_C, + self.num_groups, + self.problem_sizes_cute, + self.strides_cute, + self.pointers_cute, + self.total_clusters, + self.tensormap_cute, + self.max_active_clusters, + self.stream, + ) + + def pytorch_manual_loop(self): + """Execute PyTorch manual looping through all GEMMs.""" + results = [] + for i, (A, B, C) in enumerate(self.torch_tensors): + # Reset output tensor + C.zero_() + # Compute GEMM + result = torch.mm(A, B, out=C) + results.append(result) + return results + + def cutlass_grouped_gemm(self): + """Execute CUTLASS grouped GEMM kernel.""" + # Reset all output tensors + for A, B, C in self.torch_tensors: + C.zero_() + + # Execute grouped kernel + self.compiled_kernel( + self.initial_A, + self.initial_B, + self.initial_C, + self.problem_sizes_cute, + self.strides_cute, + self.pointers_cute, + self.tensormap_cute, + self.stream, + ) + + # Return all C tensors + return [C for 
A, B, C in self.torch_tensors] + + +def validate_grouped_gemm( + problem_sizes: List[Tuple[int, int, int, int]], tolerance=1e-2 +): + """ + Validate that grouped GEMM produces correct results. + + Args: + problem_sizes: List of (M, N, K, L) tuples + tolerance: Acceptable relative error + + Returns: + bool: True if all results match within tolerance + """ + print(f"\nValidating grouped GEMM with {len(problem_sizes)} groups") + + if not torch.cuda.is_available(): + print("✗ CUDA not available") + return False + + try: + benchmark = GroupedGemmBenchmark(problem_sizes) + + # Compute both results + pytorch_results = benchmark.pytorch_manual_loop() + cutlass_results = benchmark.cutlass_grouped_gemm() + + # Compare results group by group + all_correct = True + for i, (pytorch_result, cutlass_result) in enumerate( + zip(pytorch_results, cutlass_results) + ): + diff = torch.abs(pytorch_result - cutlass_result) + max_diff = torch.max(diff).item() + norm_diff = torch.norm(diff).item() + norm_ref = torch.norm(pytorch_result).item() + rel_error = norm_diff / norm_ref if norm_ref > 0 else float("inf") + + print(f" Group {i}: max_diff={max_diff:.2e}, rel_error={rel_error:.2e}") + + if rel_error > tolerance: + print(f" ✗ Group {i} failed tolerance check") + all_correct = False + else: + print(f" ✓ Group {i} passed") + + if all_correct: + print(f" ✓ All groups passed validation") + else: + print(f" ✗ Some groups failed validation") + + return all_correct + + except Exception as e: + print(f" ✗ Validation failed: {e}") + return False + + +def benchmark_grouped_gemm( + problem_sizes: List[Tuple[int, int, int, int]], warmup=3, rep=10 +): + """ + Benchmark grouped GEMM vs PyTorch manual looping. + + Args: + problem_sizes: List of (M, N, K, L) tuples + warmup: Number of warmup iterations + rep: Number of benchmark repetitions + + Returns: + dict: Timing results and metrics + """ + print(f"\nBenchmarking grouped GEMM with {len(problem_sizes)} groups") + + if not torch.cuda.is_available(): + print("✗ CUDA not available") + return None + + try: + benchmark = GroupedGemmBenchmark(problem_sizes) + + # Calculate total FLOPs + total_flops = sum(2 * M * N * K for M, N, K, L in problem_sizes) + + # Benchmark PyTorch manual loop + pytorch_time = triton.testing.do_bench( + benchmark.pytorch_manual_loop, warmup=warmup, rep=rep + ) + + # Benchmark CUTLASS grouped GEMM + cutlass_time = triton.testing.do_bench( + benchmark.cutlass_grouped_gemm, warmup=warmup, rep=rep + ) + + # Calculate metrics + pytorch_tflops = total_flops / (pytorch_time * 1e-3) / 1e12 + cutlass_tflops = total_flops / (cutlass_time * 1e-3) / 1e12 + speedup = pytorch_time / cutlass_time + + results = { + "num_groups": len(problem_sizes), + "total_flops": total_flops, + "pytorch_ms": pytorch_time, + "cutlass_ms": cutlass_time, + "pytorch_tflops": pytorch_tflops, + "cutlass_tflops": cutlass_tflops, + "speedup": speedup, + "problem_sizes": problem_sizes, + } + + print( + f" PyTorch manual loop: {pytorch_time:.2f} ms ({pytorch_tflops:.2f} TFLOPS)" + ) + print(f" CUTLASS grouped: {cutlass_time:.2f} ms ({cutlass_tflops:.2f} TFLOPS)") + print(f" Speedup: {speedup:.2f}x") + + return results + + except Exception as e: + print(f" ✗ Benchmark failed: {e}") + return None + + +def generate_problem_sets(): + """Generate different sets of problem sizes for comprehensive testing.""" + + problem_sets = { + "small_uniform": [(256, 256, 256, 1) for _ in range(8)], + "medium_uniform": [(512, 512, 512, 1) for _ in range(4)], + "large_uniform": [(1024, 1024, 1024, 1) for _ in 
range(2)], + "mixed_sizes": [ + (256, 256, 256, 1), + (512, 512, 512, 1), + (1024, 1024, 512, 1), + (768, 384, 256, 1), + (384, 768, 256, 1), + ], + "skinny_matrices": [ + (2048, 128, 256, 1), + (1024, 256, 512, 1), + (512, 512, 1024, 1), + (256, 1024, 512, 1), + ], + "fat_matrices": [ + (128, 2048, 256, 1), + (256, 1024, 512, 1), + (512, 512, 1024, 1), + (1024, 256, 512, 1), + ], + "many_small": [(128, 128, 128, 1) for _ in range(16)], + "few_large": [ + (2048, 2048, 1024, 1), + (1536, 1536, 768, 1), + ], + } + + return problem_sets + + +def run_grouped_gemm_benchmark_suite(): + """Run comprehensive grouped GEMM benchmark suite.""" + print("CUTLASS Grouped GEMM Benchmark Suite") + print("Comparing GroupedGemmKernel vs PyTorch Manual Looping") + print("Using Triton's do_bench for accurate timing") + print("=" * 80) + + if not HAS_CUTLASS: + print("✗ CUTLASS not available") + return 1 + + problem_sets = generate_problem_sets() + + # Validation phase + print("\n" + "=" * 80) + print("VALIDATION PHASE") + print("=" * 80) + + validation_results = [] + tolerance = 1 # 1e-2 + + for set_name, problem_sizes in problem_sets.items(): + print(f"\nValidating problem set: {set_name}") + success = validate_grouped_gemm(problem_sizes, tolerance=tolerance) + validation_results.append((set_name, success)) + + # Benchmark phase + print("\n" + "=" * 80) + print("BENCHMARK PHASE") + print("=" * 80) + + benchmark_results = [] + + for set_name, problem_sizes in problem_sets.items(): + # Only benchmark if validation passed + validation_success = next( + success for name, success in validation_results if name == set_name + ) + + if validation_success: + print(f"\nBenchmarking problem set: {set_name}") + result = benchmark_grouped_gemm(problem_sizes, warmup=3, rep=10) + if result: + result["set_name"] = set_name + benchmark_results.append(result) + else: + print(f"\nSkipping benchmark for {set_name} (validation failed)") + + # Summary + print("\n" + "=" * 80) + print("VALIDATION SUMMARY") + print("=" * 80) + + passed = sum(1 for _, success in validation_results if success) + total = len(validation_results) + + for set_name, success in validation_results: + status = "✓ PASS" if success else "✗ FAIL" + print(f" {set_name:<15}: {status}") + + print(f"\nValidation: {passed}/{total} problem sets passed") + + if benchmark_results: + print("\n" + "=" * 80) + print("PERFORMANCE SUMMARY") + print("=" * 80) + print( + f"{'Problem Set':<15} {'Groups':<8} {'PyTorch':<10} {'CUTLASS':<10} {'Speedup (x)':<8} {'Best TFLOPS':<10}" + ) + print("-" * 80) + + total_speedup = 0 + max_tflops = 0 + + for result in benchmark_results: + set_name = result["set_name"] + num_groups = result["num_groups"] + pytorch_ms = result["pytorch_ms"] + cutlass_ms = result["cutlass_ms"] + speedup = result["speedup"] + best_tflops = max(result["pytorch_tflops"], result["cutlass_tflops"]) + + print( + f"{set_name:<15} {num_groups:<8} {pytorch_ms:<8.3f} {cutlass_ms:<12.3f} " + f"{speedup:<9.2f} {best_tflops:<11.2f}" + ) + + total_speedup += speedup + max_tflops = max(max_tflops, best_tflops) + + avg_speedup = total_speedup / len(benchmark_results) + print(f"\nAverage speedup: {avg_speedup:.2f}x") + print(f"Peak performance: {max_tflops:.1f} TFLOPS") + + # Analysis + print("\n" + "=" * 80) + print("ANALYSIS") + print("=" * 80) + + # Find best and worst performing configurations + best_speedup = max(benchmark_results, key=lambda x: x["speedup"]) + worst_speedup = min(benchmark_results, key=lambda x: x["speedup"]) + best_tflops_result = max( + 
benchmark_results, + key=lambda x: max(x["pytorch_tflops"], x["cutlass_tflops"]), + ) + + print( + f"Best speedup: {best_speedup['speedup']:.2f}x ({best_speedup['set_name']})" + ) + print( + f"Worst speedup: {worst_speedup['speedup']:.2f}x ({worst_speedup['set_name']})" + ) + print( + f"Best TFLOPS: {max(best_tflops_result['pytorch_tflops'], best_tflops_result['cutlass_tflops']):.1f} ({best_tflops_result['set_name']})" + ) + + # Efficiency analysis + efficient_sets = [r for r in benchmark_results if r["speedup"] > 1.5] + print( + f"\nHigh-efficiency problem sets (>1.5x speedup): {len(efficient_sets)}/{len(benchmark_results)}" + ) + + for result in efficient_sets: + print(f" {result['set_name']}: {result['speedup']:.2f}x speedup") + + # Final status + if passed == total and benchmark_results: + print("\n🎉 All validation tests passed and benchmarks completed successfully!") + print( + "Grouped GEMM shows significant benefits for batch processing multiple smaller GEMMs!" + ) + return 0 + else: + print( + f"\n⚠ {passed}/{total} validation tests passed, benchmarks may be incomplete" + ) + return 1 + + +if __name__ == "__main__": + exit(run_grouped_gemm_benchmark_suite()) From 79b2a62cf9e472116562cdbdd4b74136785b4d94 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sun, 8 Jun 2025 20:04:20 -0700 Subject: [PATCH 03/34] add groupGemm cute strategy --- .../experiments/deepseek_v3/group_gemms.py | 467 +++++++++++++ .../experiments/deepseek_v3/test_moe.py | 630 ++++++++++++++++++ 2 files changed, 1097 insertions(+) create mode 100644 torchtitan/experiments/deepseek_v3/test_moe.py diff --git a/torchtitan/experiments/deepseek_v3/group_gemms.py b/torchtitan/experiments/deepseek_v3/group_gemms.py index f4020dee2..ca93fffa6 100644 --- a/torchtitan/experiments/deepseek_v3/group_gemms.py +++ b/torchtitan/experiments/deepseek_v3/group_gemms.py @@ -47,6 +47,27 @@ except ImportError: TRITON_CONTIGUOUS_GROUP_GEMM_AVAILABLE = False +try: + import cuda.bindings.driver as cuda + import cutlass + import cutlass.cute as cute + import cutlass.torch as cutlass_torch + import cutlass.utils as utils + + # Import our strategy - UPDATE PATH AS NEEDED + + from cutlass.cute.runtime import from_dlpack + from torchtitan.experiments.kernels.blackwell.cute_grouped_gemm import ( + GroupedGemmKernel, + ) + + HAS_CUTLASS = True + print("✓ CUTLASS and strategies imported successfully") +except ImportError as e: + HAS_CUTLASS = False + print(f"✗ Import failed: {e}") + print("Using PyTorch fallback implementations only") + # Strategy base class for GroupGEMM implementations class GroupGEMMStrategy: @@ -97,9 +118,455 @@ def is_available() -> bool: "TorchBF16GroupGEMM", "TorchAOBF16GroupGEMM", "TritonCGBF16GroupGEMM", + "CUTLASSGroupedGemmStrategy", + "ManualLoopGroupGEMM", ] +class ManualLoopGroupGEMM(GroupGEMMStrategy): + """Manual looping baseline implementation for any arch (esp Blackwell) support""" + + def arrange_expert_weights(self, all_weights, submod_name, module): + """Store weights in a stacked format""" + return torch.stack(all_weights) + + def execute(self, contig_tokens, m_sizes, m_offsets, module): + """Execute using manual loops over experts""" + # Get weights + + w_gate = module.get_parameter("gate_proj_weight") + w_up = module.get_parameter("up_proj_weight") + w_down = module.get_parameter("down_proj_weight") + + # Prepare output tensor + hidden_size = w_gate.shape[ + 2 + ] # stacked weights shape [num_experts, out_dim, in_dim] + output = torch.zeros( + contig_tokens.shape[0], + hidden_size, + 
dtype=contig_tokens.dtype, + device=contig_tokens.device, + ) + + # Process each expert sequentially + offset = 0 + for expert_idx, size in enumerate(m_sizes): + if size > 0: + # Get tokens for this expert + expert_tokens = contig_tokens[offset : offset + size] + + # Get weights for this expert + gate_weight = w_gate[expert_idx] # [out_dim, in_dim] + up_weight = w_up[expert_idx] + down_weight = w_down[expert_idx] + + # Forward pass: gate and up projections + gate_out = torch.mm(expert_tokens, gate_weight.t()) + up_out = torch.mm(expert_tokens, up_weight.t()) + + # Apply activation and combine + hidden = self.activation_function(gate_out) * up_out + + # Down projection + expert_output = torch.mm(hidden, down_weight.t()) + + # Store results + output[offset : offset + size] = expert_output + + offset += size + + return output + + @staticmethod + def is_available() -> bool: + return True + + +class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): + """ + Strategy using CUTLASS GroupedGemmKernel for group GEMM operations + + """ + + def __init__(self, custom_activation): + super().__init__(custom_activation) + self.dtype_torch = torch.bfloat16 + self.dtype_cutlass = cutlass.BFloat16 + self.acc_dtype = cutlass.Float32 + self.alignment = 16 + + # Create grouped GEMM kernel + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.acc_dtype, + use_2cta_instrs=False, + mma_tiler_mn=(128, 64), + cluster_shape_mn=(1, 1), + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + # Setup hardware info and stream + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters(1) + + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) + + # Cache for compiled kernels and tensormap buffers + self._compiled_kernels = {} + self._tensormap_buffers = {} + + print("Initialized CUTLASSGroupedGemmStrategy with GroupedGemmKernel") + + def arrange_expert_weights(self, all_weights, submod_name, module): + """Store weights in stacked format""" + return torch.stack(all_weights) + + def execute(self, contig_tokens, m_sizes, m_offsets, module): + """Execute using CUTLASS grouped GEMM kernel""" + # Get weights + w_gate = module.get_parameter("gate_proj_weight") + w_up = module.get_parameter("up_proj_weight") + w_down = module.get_parameter("down_proj_weight") + + device = contig_tokens.device + hidden_size = w_gate.shape[2] + + # Prepare output tensor + output = torch.zeros( + contig_tokens.shape[0], hidden_size, dtype=self.dtype_torch, device=device + ) + + # Filter valid experts + valid_experts = [(i, size) for i, size in enumerate(m_sizes) if size > 0] + if not valid_experts: + return output + + # Step 1: Execute gate and up projections using grouped GEMM + gate_outputs, up_outputs = self._execute_gate_up_projections( + contig_tokens, w_gate, w_up, m_sizes, device + ) + + # Step 2: Apply activation and combine + hidden_states = self._apply_activation_and_combine( + gate_outputs, up_outputs, m_sizes + ) + + # Step 3: Execute down projection using grouped GEMM + final_outputs = self._execute_down_projection( + hidden_states, w_down, m_sizes, device + ) + + # Step 4: Reconstruct full output + return self._reconstruct_output(final_outputs, m_sizes, output) + + def _execute_gate_up_projections( + self, contig_tokens, w_gate, w_up, m_sizes, device + ): + """Execute gate and up projections using grouped GEMM""" + # Prepare tensors and metadata for gate and up projections + problem_sizes = [] + strides_abc = [] + ptrs_abc = 
[] + gate_outputs = [] + up_outputs = [] + + offset = 0 + for expert_idx, size in enumerate(m_sizes): + if size > 0: + # Get expert tokens + expert_tokens = contig_tokens[offset : offset + size].contiguous() + gate_weight = w_gate[ + expert_idx + ].contiguous() # [intermediate_size, hidden_size] + up_weight = w_up[expert_idx].contiguous() + + M, K = expert_tokens.shape + N = gate_weight.shape[0] # intermediate_size + L = 1 + + # Create output tensors for gate and up projections + gate_output = torch.empty(M, N, dtype=self.dtype_torch, device=device) + up_output = torch.empty(M, N, dtype=self.dtype_torch, device=device) + + # Convert to MNKL format (following bench_group_gemm.py) + expert_tokens_mnkl = expert_tokens.unsqueeze( + -1 + ).contiguous() # (M, K, 1) + gate_weight_mnkl = gate_weight.unsqueeze(-1).contiguous() # (N, K, 1) + up_weight_mnkl = up_weight.unsqueeze(-1).contiguous() # (N, K, 1) + gate_output_mnkl = gate_output.unsqueeze(-1).contiguous() # (M, N, 1) + up_output_mnkl = up_output.unsqueeze(-1).contiguous() # (M, N, 1) + + # Extract 2D strides from MNKL tensors (following bench_group_gemm.py) + expert_tokens_strides = expert_tokens_mnkl.stride()[:2] + gate_weight_strides = gate_weight_mnkl.stride()[:2] + up_weight_strides = up_weight_mnkl.stride()[:2] + gate_output_strides = gate_output_mnkl.stride()[:2] + up_output_strides = up_output_mnkl.stride()[:2] + + # Gate projection metadata + problem_sizes.append([M, N, K, L]) + strides_abc.append( + [ + list(expert_tokens_strides), # A strides + list(gate_weight_strides), # B strides + list(gate_output_strides), # C strides + ] + ) + ptrs_abc.append( + [ + expert_tokens.data_ptr(), + gate_weight.data_ptr(), + gate_output.data_ptr(), + ] + ) + + # Up projection metadata + problem_sizes.append([M, N, K, L]) + strides_abc.append( + [ + list(expert_tokens_strides), # A strides + list(up_weight_strides), # B strides + list(up_output_strides), # C strides + ] + ) + ptrs_abc.append( + [ + expert_tokens.data_ptr(), + up_weight.data_ptr(), + up_output.data_ptr(), + ] + ) + + gate_outputs.append(gate_output) + up_outputs.append(up_output) + + offset += size + + if not problem_sizes: + return [], [] + + # Execute grouped GEMM for gate and up projections + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return gate_outputs, up_outputs + + def _execute_down_projection(self, hidden_states, w_down, m_sizes, device): + """Execute down projection using grouped GEMM""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + down_outputs = [] + + expert_idx = 0 + for size in m_sizes: + if size > 0 and expert_idx < len(hidden_states): + hidden = hidden_states[expert_idx] + down_weight = w_down[ + expert_idx + ].contiguous() # [hidden_size, intermediate_size] + + M, K = hidden.shape + N = down_weight.shape[0] # hidden_size + L = 1 + + # Create output tensor + down_output = torch.empty(M, N, dtype=self.dtype_torch, device=device) + + # Convert to MNKL format (following bench_group_gemm.py) + hidden_mnkl = hidden.unsqueeze(-1).contiguous() # (M, K, 1) + down_weight_mnkl = down_weight.unsqueeze(-1).contiguous() # (N, K, 1) + down_output_mnkl = down_output.unsqueeze(-1).contiguous() # (M, N, 1) + + # Extract 2D strides from MNKL tensors (following bench_group_gemm.py) + hidden_strides = hidden_mnkl.stride()[:2] + down_weight_strides = down_weight_mnkl.stride()[:2] + down_output_strides = down_output_mnkl.stride()[:2] + + # Down projection metadata + problem_sizes.append([M, N, K, L]) + strides_abc.append( + [ + 
list(hidden_strides), # A strides + list(down_weight_strides), # B strides + list(down_output_strides), # C strides + ] + ) + ptrs_abc.append( + [hidden.data_ptr(), down_weight.data_ptr(), down_output.data_ptr()] + ) + + down_outputs.append(down_output) + expert_idx += 1 + + if not problem_sizes: + return [] + + # Execute grouped GEMM for down projection + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return down_outputs + + def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): + """Execute the grouped GEMM kernel""" + num_groups = len(problem_sizes) + + # Convert to tensors + problem_sizes_tensor = torch.tensor( + problem_sizes, dtype=torch.int32, device=device + ) + strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) + ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) + + # Convert to CUTE tensors + problem_sizes_cute = from_dlpack( + problem_sizes_tensor, assumed_align=self.alignment + ) + strides_cute = from_dlpack(strides_tensor, assumed_align=self.alignment) + ptrs_cute = from_dlpack(ptrs_tensor, assumed_align=self.alignment) + + # Setup tensormap buffer + tensormap_cute = self._get_tensormap_buffer(device) + + # Compute total clusters + total_clusters = self._compute_total_clusters(problem_sizes) + + # Create initial tensors for kernel compilation (use first problem for shapes) + initial_A, initial_B, initial_C = self._create_initial_tensors( + problem_sizes[0], device + ) + + # Get or compile kernel + cache_key = (num_groups, total_clusters) + if cache_key not in self._compiled_kernels: + print(f"Compiling grouped GEMM kernel for {num_groups} groups") + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + initial_A, + initial_B, + initial_C, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + self.stream, + ) + print("Kernel compilation successful") + + # Execute kernel + compiled_kernel = self._compiled_kernels[cache_key] + compiled_kernel( + initial_A, + initial_B, + initial_C, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + + # Synchronize to ensure completion + torch.cuda.synchronize() + + def _create_initial_tensors(self, problem_shape, device): + """Create initial CUTE tensors for kernel compilation""" + M, N, K, L = problem_shape + + # Create tensors with the right shapes + # A: tokens [M, K], B: weights [N, K], C: output [M, N] + A_init = torch.randn(M, K, dtype=self.dtype_torch, device=device) + B_init = torch.randn( + N, K, dtype=self.dtype_torch, device=device + ) # Already (N, K) format + C_init = torch.zeros(M, N, dtype=self.dtype_torch, device=device) + + # Convert to MNKL format + A_mnkl = A_init.unsqueeze(-1).contiguous() # (M, K) -> (M, K, 1) + B_mnkl = B_init.unsqueeze( + -1 + ).contiguous() # (N, K) -> (N, K, 1) - no transpose needed + C_mnkl = C_init.unsqueeze(-1).contiguous() # (M, N) -> (M, N, 1) + + # Create CUTE tensors + A_cute = from_dlpack(A_mnkl, assumed_align=self.alignment) + B_cute = from_dlpack(B_mnkl, assumed_align=self.alignment) + C_cute = from_dlpack(C_mnkl, assumed_align=self.alignment) + + # Set CUTLASS data types + A_cute.element_type = self.dtype_cutlass + B_cute.element_type = self.dtype_cutlass + C_cute.element_type = self.dtype_cutlass + + # Mark layouts as dynamic + A_cute = A_cute.mark_layout_dynamic(leading_dim=1) + B_cute = B_cute.mark_layout_dynamic(leading_dim=1) + C_cute = 
C_cute.mark_layout_dynamic(leading_dim=1) + + return A_cute, B_cute, C_cute + + def _get_tensormap_buffer(self, device): + """Get or create tensormap buffer""" + if device not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + tensormap_tensor = torch.zeros( + (sm_count, 3, 128 // 8), # 3 tensormaps (A, B, C), 128 bytes each + dtype=torch.int64, + device=device, + ) + self._tensormap_buffers[device] = from_dlpack( + tensormap_tensor, assumed_align=self.alignment + ) + + return self._tensormap_buffers[device] + + def _compute_total_clusters(self, problem_sizes): + """Compute total number of clusters needed""" + cluster_tile_m = 128 # From mma_tiler_mn[0] + cluster_tile_n = 64 # From mma_tiler_mn[1] + + total = 0 + for M, N, K, L in problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + + return total + + def _apply_activation_and_combine(self, gate_outputs, up_outputs, m_sizes): + """Apply activation and combine gate/up outputs""" + hidden_states = [] + + for gate_out, up_out in zip(gate_outputs, up_outputs): + # Apply activation to gate output and multiply with up output + activated_gate = self.activation_function(gate_out) + combined = activated_gate * up_out + hidden_states.append(combined) + + return hidden_states + + def _reconstruct_output(self, final_outputs, m_sizes, output): + """Reconstruct the full output tensor from expert results""" + offset = 0 + expert_idx = 0 + + for size in m_sizes: + if size > 0 and expert_idx < len(final_outputs): + output[offset : offset + size] = final_outputs[expert_idx] + expert_idx += 1 + offset += size + + return output + + @staticmethod + def is_available() -> bool: + return CUTLASS_AVAILABLE + + class TritonCGBF16GroupGEMM(GroupGEMMStrategy): """Implementation of Triton Contiguous group Gemm""" diff --git a/torchtitan/experiments/deepseek_v3/test_moe.py b/torchtitan/experiments/deepseek_v3/test_moe.py new file mode 100644 index 000000000..45d2089d7 --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/test_moe.py @@ -0,0 +1,630 @@ +#!/usr/bin/env python3 +""" +Simple MoE Benchmark using CUTLASSGroupedGemmStrategy + +This benchmark creates a realistic MoE layer and compares: +1. CUTLASSGroupedGemmStrategy (our optimized approach) +2. Manual looping through experts (baseline) +3. PyTorch grouped_mm (if available) + +Measures performance across different scales and expert utilization patterns. 
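+
+A rough usage sketch of the pieces below (shapes are illustrative; it assumes this
+file sits next to group_gemms.py, matching the import further down):
+
+.. code-block:: python
+
+ layer = SimpleMoELayer(hidden_size=512, intermediate_size=1024,
+ num_experts=8, top_k=2).cuda()
+ x = torch.randn(2, 128, 512, dtype=torch.bfloat16, device="cuda") * 0.02
+ tokens, m_sizes, m_offsets, _ = layer.route_tokens(x)
+ strategy = CUTLASSGroupedGemmStrategy(layer.silu_activation)
+ out = layer.reconstruct_output(strategy.execute(tokens, m_sizes, m_offsets, layer))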
+""" + +import gc +import math +import time +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Import CUTLASS components +try: + import cuda.bindings.driver as cuda + import cutlass + import cutlass.cute as cute + import cutlass.torch as cutlass_torch + import cutlass.utils as utils + + # Import our strategy - UPDATE PATH AS NEEDED + + from cutlass.cute.runtime import from_dlpack + from group_gemms import CUTLASSGroupedGemmStrategy, ManualLoopGroupGEMM + + HAS_CUTLASS = True + print("✓ CUTLASS and strategies imported successfully") +except ImportError as e: + HAS_CUTLASS = False + print(f"✗ Import failed: {e}") + print("Using PyTorch fallback implementations only") + + +class SimpleMoELayer(nn.Module): + """ + Simplified MoE layer for benchmarking + + Architecture: + - Router: Linear layer that outputs expert probabilities + - Experts: Each expert has gate_proj -> activation -> up_proj -> down_proj + - Top-K routing: Each token is assigned to top_k experts + """ + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + num_experts: int, + top_k: int = 2, + dtype: torch.dtype = torch.bfloat16, + ): + super().__init__() + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_experts = num_experts + self.top_k = top_k + self.dtype = dtype + + # Router network + self.router = nn.Linear(hidden_size, num_experts, bias=False, dtype=dtype) + + # Expert weights - stored as [num_experts, out_dim, in_dim] for our strategy + self.gate_weights = nn.Parameter( + torch.randn(num_experts, intermediate_size, hidden_size, dtype=dtype) + * math.sqrt(2.0 / (hidden_size + intermediate_size)) + ) + + self.up_weights = nn.Parameter( + torch.randn(num_experts, intermediate_size, hidden_size, dtype=dtype) + * math.sqrt(2.0 / (hidden_size + intermediate_size)) + ) + + self.down_weights = nn.Parameter( + torch.randn(num_experts, hidden_size, intermediate_size, dtype=dtype) + * math.sqrt(2.0 / (hidden_size + intermediate_size)) + ) + + # Mock parameter access for strategies + self._expert_params = { + "gate_proj_weight": self.gate_weights, + "up_proj_weight": self.up_weights, + "down_proj_weight": self.down_weights, + } + + def get_parameter(self, name: str): + """Strategy interface for accessing parameters""" + return self._expert_params.get(name) + + def silu_activation(self, x): + """SiLU activation function""" + return x * torch.sigmoid(x) + + def route_tokens( + self, hidden_states: torch.Tensor + ) -> Tuple[torch.Tensor, List[int], List[int], torch.Tensor]: + """ + Route tokens to experts using top-k selection + + Returns: + contig_tokens: Tokens arranged contiguously by expert assignment + m_sizes: Number of tokens assigned to each expert + m_offsets: Cumulative token offsets for each expert + routing_weights: Weights for combining expert outputs + """ + batch_size, seq_len, hidden_size = hidden_states.shape + hidden_states = hidden_states.view( + -1, hidden_size + ) # [total_tokens, hidden_size] + + # Get routing scores + router_logits = self.router(hidden_states) # [total_tokens, num_experts] + routing_weights, selected_experts = torch.topk( + router_logits, self.top_k, dim=-1 + ) + routing_weights = F.softmax(routing_weights, dim=-1) + + # For simplicity, assign each token to its top-1 expert + # In practice, you'd handle top-k routing with load balancing + top_expert = selected_experts[:, 0] # [total_tokens] + token_weights = routing_weights[:, 0] # [total_tokens] + + # Count tokens per 
expert + m_sizes = [0] * self.num_experts + expert_tokens = [[] for _ in range(self.num_experts)] + expert_weights = [[] for _ in range(self.num_experts)] + + for token_idx, expert_idx in enumerate(top_expert): + expert_idx = expert_idx.item() + m_sizes[expert_idx] += 1 + expert_tokens[expert_idx].append(token_idx) + expert_weights[expert_idx].append(token_weights[token_idx]) + + # Create contiguous token arrangement + contig_tokens = [] + token_to_output_pos = {} + current_pos = 0 + + for expert_idx in range(self.num_experts): + if m_sizes[expert_idx] > 0: + expert_token_indices = expert_tokens[expert_idx] + expert_hidden_states = hidden_states[expert_token_indices] + contig_tokens.append(expert_hidden_states) + + # Track where each token should go in the output + for local_pos, global_token_idx in enumerate(expert_token_indices): + token_to_output_pos[global_token_idx] = current_pos + local_pos + + current_pos += m_sizes[expert_idx] + + if contig_tokens: + contig_tokens = torch.cat(contig_tokens, dim=0) + else: + contig_tokens = torch.empty( + 0, hidden_size, dtype=self.dtype, device=hidden_states.device + ) + + # Create offsets + m_offsets = [] + cumsum = 0 + for size in m_sizes: + cumsum += size + m_offsets.append(cumsum) + + # Store routing info for output reconstruction + self._routing_info = { + "token_to_output_pos": token_to_output_pos, + "expert_weights": expert_weights, + "original_shape": (batch_size, seq_len, hidden_size), + } + + return contig_tokens, m_sizes, m_offsets, token_weights + + def reconstruct_output(self, expert_outputs: torch.Tensor) -> torch.Tensor: + """Reconstruct output tensor from expert results""" + routing_info = self._routing_info + batch_size, seq_len, hidden_size = routing_info["original_shape"] + total_tokens = batch_size * seq_len + + # Initialize output + output = torch.zeros( + total_tokens, hidden_size, dtype=self.dtype, device=expert_outputs.device + ) + + # Place expert outputs back in original token positions + current_pos = 0 + for expert_idx in range(self.num_experts): + if len(routing_info["expert_weights"][expert_idx]) > 0: + expert_size = len(routing_info["expert_weights"][expert_idx]) + expert_output = expert_outputs[current_pos : current_pos + expert_size] + expert_weight_list = routing_info["expert_weights"][expert_idx] + + # Apply expert-specific routing weights + for local_pos, (global_token_idx, weight) in enumerate( + zip( + [ + k + for k, v in routing_info["token_to_output_pos"].items() + if current_pos <= v < current_pos + expert_size + ], + expert_weight_list, + ) + ): + output[global_token_idx] = expert_output[local_pos] * weight + + current_pos += expert_size + + return output.view(batch_size, seq_len, hidden_size) + + +class MoEBenchmark: + """Benchmark harness for MoE implementations""" + + def __init__(self): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.dtype = torch.bfloat16 + + def create_test_data( + self, batch_size: int, seq_len: int, hidden_size: int + ) -> torch.Tensor: + """Create test input data""" + return ( + torch.randn( + batch_size, seq_len, hidden_size, dtype=self.dtype, device=self.device + ) + * 0.02 + ) + + def benchmark_cutlass_strategy( + self, + moe_layer: SimpleMoELayer, + hidden_states: torch.Tensor, + iterations: int = 10, + ) -> Tuple[torch.Tensor, float]: + """Benchmark CUTLASS grouped GEMM strategy""" + strategy = CUTLASSGroupedGemmStrategy(moe_layer.silu_activation) + + # Warmup + for _ in range(3): + contig_tokens, m_sizes, m_offsets, _ = 
moe_layer.route_tokens(hidden_states) + if sum(m_sizes) > 0: + expert_outputs = strategy.execute( + contig_tokens, m_sizes, m_offsets, moe_layer + ) + + torch.cuda.synchronize() + + # Benchmark + start_time = time.time() + + for _ in range(iterations): + contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens(hidden_states) + if sum(m_sizes) > 0: + expert_outputs = strategy.execute( + contig_tokens, m_sizes, m_offsets, moe_layer + ) + else: + expert_outputs = torch.empty( + 0, moe_layer.hidden_size, dtype=self.dtype, device=self.device + ) + + torch.cuda.synchronize() + end_time = time.time() + + # Final forward pass for output + contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens(hidden_states) + if sum(m_sizes) > 0: + expert_outputs = strategy.execute( + contig_tokens, m_sizes, m_offsets, moe_layer + ) + else: + expert_outputs = torch.empty( + 0, moe_layer.hidden_size, dtype=self.dtype, device=self.device + ) + + final_output = moe_layer.reconstruct_output(expert_outputs) + avg_time = (end_time - start_time) / iterations * 1000 # ms + + return final_output, avg_time + + def benchmark_manual_strategy( + self, + moe_layer: SimpleMoELayer, + hidden_states: torch.Tensor, + iterations: int = 10, + ) -> Tuple[torch.Tensor, float]: + """Benchmark manual loop strategy""" + strategy = ManualLoopGroupGEMM(moe_layer.silu_activation) + + # Warmup + for _ in range(3): + contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens(hidden_states) + if sum(m_sizes) > 0: + expert_outputs = strategy.execute( + contig_tokens, m_sizes, m_offsets, moe_layer + ) + + torch.cuda.synchronize() + + # Benchmark + start_time = time.time() + + for _ in range(iterations): + contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens(hidden_states) + if sum(m_sizes) > 0: + expert_outputs = strategy.execute( + contig_tokens, m_sizes, m_offsets, moe_layer + ) + else: + expert_outputs = torch.empty( + 0, moe_layer.hidden_size, dtype=self.dtype, device=self.device + ) + + torch.cuda.synchronize() + end_time = time.time() + + # Final forward pass for output + contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens(hidden_states) + if sum(m_sizes) > 0: + expert_outputs = strategy.execute( + contig_tokens, m_sizes, m_offsets, moe_layer + ) + else: + expert_outputs = torch.empty( + 0, moe_layer.hidden_size, dtype=self.dtype, device=self.device + ) + + final_output = moe_layer.reconstruct_output(expert_outputs) + avg_time = (end_time - start_time) / iterations * 1000 # ms + + return final_output, avg_time + + def benchmark_pytorch_grouped_mm( + self, + moe_layer: SimpleMoELayer, + hidden_states: torch.Tensor, + iterations: int = 10, + ) -> Tuple[torch.Tensor, float]: + """Benchmark PyTorch _grouped_mm if available""" + # Check if _grouped_mm exists and we're on a supported device + if not hasattr(torch, "_grouped_mm"): + return None, float("inf") + + # Check if we're on Blackwell (compute capability 9.0) + device_props = torch.cuda.get_device_properties(hidden_states.device) + if device_props.major != 9: + print(" Skipping torch._grouped_mm: requires compute capability 9.0") + return None, float("inf") + + # Warmup and benchmark similar to other methods + for _ in range(3): + contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens(hidden_states) + if sum(m_sizes) > 0: + # Simulate grouped_mm operations + self._pytorch_grouped_mm_forward(moe_layer, contig_tokens, m_offsets) + + torch.cuda.synchronize() + + start_time = time.time() + + for _ in range(iterations): + contig_tokens, m_sizes, 
m_offsets, _ = moe_layer.route_tokens(hidden_states) + if sum(m_sizes) > 0: + expert_outputs = self._pytorch_grouped_mm_forward( + moe_layer, contig_tokens, m_offsets + ) + else: + expert_outputs = torch.empty( + 0, moe_layer.hidden_size, dtype=self.dtype, device=self.device + ) + + torch.cuda.synchronize() + end_time = time.time() + + # Final forward pass + contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens(hidden_states) + if sum(m_sizes) > 0: + expert_outputs = self._pytorch_grouped_mm_forward( + moe_layer, contig_tokens, m_offsets + ) + else: + expert_outputs = torch.empty( + 0, moe_layer.hidden_size, dtype=self.dtype, device=self.device + ) + + final_output = moe_layer.reconstruct_output(expert_outputs) + avg_time = (end_time - start_time) / iterations * 1000 # ms + + return final_output, avg_time + + def _pytorch_grouped_mm_forward( + self, moe_layer: SimpleMoELayer, tokens: torch.Tensor, m_offsets: List[int] + ) -> torch.Tensor: + # Convert m_offsets list to tensor + m_offsets_tensor = torch.tensor(m_offsets, device=tokens.device) + + # Gate and up projections + gate_proj = torch._grouped_mm( + tokens, + moe_layer.gate_weights.transpose(-2, -1), + m_offsets_tensor, # Use tensor instead of list + out_dtype=self.dtype, + ) + up_proj = torch._grouped_mm( + tokens, + moe_layer.up_weights.transpose(-2, -1), + m_offsets_tensor, # Use tensor instead of list + out_dtype=self.dtype, + ) + + # Apply activation and combine + hidden_outputs = moe_layer.silu_activation(gate_proj) * up_proj + + # Down projection + final_outputs = torch._grouped_mm( + hidden_outputs, + moe_layer.down_weights.transpose(-2, -1), + m_offsets_tensor, # Use tensor instead of list + out_dtype=self.dtype, + ) + + return final_outputs + + def validate_outputs( + self, output1: torch.Tensor, output2: torch.Tensor, tolerance: float = 1e-1 + ) -> bool: + """Validate that two outputs are close""" + if output1.shape != output2.shape: + print(f"Shape mismatch: {output1.shape} vs {output2.shape}") + return False + + diff = torch.abs(output1 - output2) + max_diff = torch.max(diff).item() + rel_error = torch.norm(diff).item() / torch.norm(output1).item() + + print(f" Max diff: {max_diff:.6f}, Rel error: {rel_error:.6f}") + + return max_diff < tolerance and rel_error < tolerance + + def run_benchmark_suite(self): + """Run comprehensive benchmark suite""" + print("=" * 80) + print("MoE Benchmark Suite") + print("=" * 80) + + # Test configurations + configs = [ + { + "name": "Small MoE", + "batch_size": 4, + "seq_len": 512, + "hidden_size": 512, + "intermediate_size": 1024, + "num_experts": 8, + "top_k": 2, + }, + { + "name": "Medium MoE", + "batch_size": 8, + "seq_len": 1024, + "hidden_size": 1024, + "intermediate_size": 2048, + "num_experts": 16, + "top_k": 2, + }, + { + "name": "Large MoE", + "batch_size": 16, + "seq_len": 2048, + "hidden_size": 2048, + "intermediate_size": 4096, + "num_experts": 32, + "top_k": 2, + }, + ] + + all_passed = True + + for config in configs: + print(f"\n" + "=" * 60) + print(f"Benchmarking: {config['name']}") + print( + f" Shape: [{config['batch_size']}, {config['seq_len']}, {config['hidden_size']}]" + ) + print(f" Experts: {config['num_experts']}, Top-K: {config['top_k']}") + print(f" Intermediate size: {config['intermediate_size']}") + print("=" * 60) + + try: + # Create test data + hidden_states = self.create_test_data( + config["batch_size"], config["seq_len"], config["hidden_size"] + ) + + # Create MoE layer + moe_layer = SimpleMoELayer( + hidden_size=config["hidden_size"], + 
intermediate_size=config["intermediate_size"], + num_experts=config["num_experts"], + top_k=config["top_k"], + dtype=self.dtype, + ).to(self.device) + + # Show routing statistics + with torch.no_grad(): + contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens( + hidden_states + ) + active_experts = sum(1 for s in m_sizes if s > 0) + total_tokens = sum(m_sizes) + print( + f" Routing: {total_tokens} tokens across {active_experts}/{config['num_experts']} experts" + ) + print(f" Expert loads: {m_sizes}") + + results = {} + + # Benchmark manual strategy + print(f"\n Benchmarking Manual Loop Strategy...") + manual_output, manual_time = self.benchmark_manual_strategy( + moe_layer, hidden_states + ) + results["manual"] = (manual_output, manual_time) + print(f" Time: {manual_time:.2f} ms") + + # Benchmark CUTLASS strategy + if HAS_CUTLASS: + print(f"\n Benchmarking CUTLASS Strategy...") + cutlass_output, cutlass_time = self.benchmark_cutlass_strategy( + moe_layer, hidden_states + ) + results["cutlass"] = (cutlass_output, cutlass_time) + print(f" Time: {cutlass_time:.2f} ms") + + # Validate + print(f"\n Validating CUTLASS vs Manual...") + if self.validate_outputs(manual_output, cutlass_output): + print(f" ✓ Validation passed") + speedup = manual_time / cutlass_time + print(f" Speedup: {speedup:.2f}x") + else: + print(f" ✗ Validation failed") + all_passed = False + + # Benchmark PyTorch grouped_mm + if hasattr(torch, "_grouped_mm"): + print(f"\n Benchmarking PyTorch grouped_mm...") + pytorch_output, pytorch_time = self.benchmark_pytorch_grouped_mm( + moe_layer, hidden_states + ) + results["pytorch"] = (pytorch_output, pytorch_time) + print(f" Time: {pytorch_time:.2f} ms") + + if pytorch_output is not None: + print(f"\n Validating PyTorch vs Manual...") + if self.validate_outputs(manual_output, pytorch_output): + print(f" ✓ Validation passed") + else: + print(f" ✗ Validation failed") + + # Summary + print(f"\n Performance Summary:") + print(f" Manual Loop: {results['manual'][1]:.2f} ms") + if "cutlass" in results: + print(f" CUTLASS Grouped: {results['cutlass'][1]:.2f} ms") + if "pytorch" in results: + print(f" PyTorch grouped: {results['pytorch'][1]:.2f} ms") + + # Calculate FLOPS + total_tokens = config["batch_size"] * config["seq_len"] + # Approximate FLOPs: 2 * (gate + up + down projections) + flops_per_token = 2 * ( + config["hidden_size"] * config["intermediate_size"] * 2 # gate + up + + config["intermediate_size"] * config["hidden_size"] + ) # down + total_flops = total_tokens * flops_per_token + + manual_tflops = total_flops / (results["manual"][1] * 1e-3) / 1e12 + print(f" Manual TFLOPS: {manual_tflops:.2f}") + + if "cutlass" in results: + cutlass_tflops = total_flops / (results["cutlass"][1] * 1e-3) / 1e12 + print(f" CUTLASS TFLOPS: {cutlass_tflops:.2f}") + + except Exception as e: + print(f"✗ {config['name']} failed: {e}") + import traceback + + traceback.print_exc() + all_passed = False + + finally: + # Cleanup + torch.cuda.empty_cache() + gc.collect() + + print(f"\n" + "=" * 80) + if all_passed: + print("🎉 All benchmarks completed successfully!") + else: + print("⚠️ Some benchmarks failed") + print("=" * 80) + + return all_passed + + +def main(): + """Run the MoE benchmark""" + benchmark = MoEBenchmark() + + print("MoE Performance Benchmark") + print(f"Device: {benchmark.device}") + print(f"Dtype: {benchmark.dtype}") + print(f"CUTLASS Available: {HAS_CUTLASS}") + print(f"PyTorch grouped_mm Available: {hasattr(torch, '_grouped_mm')}") + + success = benchmark.run_benchmark_suite() + 
return 0 if success else 1 + + +if __name__ == "__main__": + exit(main()) From 05d4506d3fd741b04f78d981eb3de1fd60c8ab1e Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sun, 8 Jun 2025 21:03:48 -0700 Subject: [PATCH 04/34] use triton.do_bench for improved accuracy --- .../experiments/deepseek_v3/test_moe.py | 334 ++++++++++++++---- 1 file changed, 259 insertions(+), 75 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/test_moe.py b/torchtitan/experiments/deepseek_v3/test_moe.py index 45d2089d7..2b422fd31 100644 --- a/torchtitan/experiments/deepseek_v3/test_moe.py +++ b/torchtitan/experiments/deepseek_v3/test_moe.py @@ -7,18 +7,28 @@ 2. Manual looping through experts (baseline) 3. PyTorch grouped_mm (if available) -Measures performance across different scales and expert utilization patterns. +Uses Triton's do_bench for accurate GPU timing, with CUDA events as fallback. """ import gc import math import time -from typing import List, Optional, Tuple +from typing import Callable, List, Optional, Tuple import torch import torch.nn as nn import torch.nn.functional as F +# Import timing utilities +try: + import triton.testing + + HAS_TRITON_BENCH = True + print("✓ Triton do_bench available for accurate timing") +except ImportError: + HAS_TRITON_BENCH = False + print("⚠️ Triton do_bench not available, using CUDA events") + # Import CUTLASS components try: import cuda.bindings.driver as cuda @@ -27,10 +37,12 @@ import cutlass.torch as cutlass_torch import cutlass.utils as utils - # Import our strategy - UPDATE PATH AS NEEDED - from cutlass.cute.runtime import from_dlpack - from group_gemms import CUTLASSGroupedGemmStrategy, ManualLoopGroupGEMM + from group_gemms import ( + CUTLASSGroupedGemmStrategy, + GroupGEMMStrategy, + ManualLoopGroupGEMM, + ) HAS_CUTLASS = True print("✓ CUTLASS and strategies imported successfully") @@ -101,14 +113,14 @@ def silu_activation(self, x): def route_tokens( self, hidden_states: torch.Tensor - ) -> Tuple[torch.Tensor, List[int], List[int], torch.Tensor]: + ) -> Tuple[torch.Tensor, List[int], torch.Tensor, torch.Tensor]: """ Route tokens to experts using top-k selection Returns: contig_tokens: Tokens arranged contiguously by expert assignment m_sizes: Number of tokens assigned to each expert - m_offsets: Cumulative token offsets for each expert + m_offsets: Cumulative token offsets for each expert (as torch.Tensor) routing_weights: Weights for combining expert outputs """ batch_size, seq_len, hidden_size = hidden_states.shape @@ -163,12 +175,16 @@ def route_tokens( 0, hidden_size, dtype=self.dtype, device=hidden_states.device ) - # Create offsets - m_offsets = [] + # Create offsets as torch.Tensor (required for PyTorch grouped_mm) + m_offsets_list = [] cumsum = 0 for size in m_sizes: cumsum += size - m_offsets.append(cumsum) + m_offsets_list.append(cumsum) + + m_offsets = torch.tensor( + m_offsets_list, dtype=torch.int32, device=hidden_states.device + ) # Store routing info for output reconstruction self._routing_info = { @@ -223,6 +239,44 @@ def __init__(self): self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.dtype = torch.bfloat16 + # Set timing method + if HAS_TRITON_BENCH: + self.timing_method = "triton" + else: + self.timing_method = "cuda_events" + + # Get GPU architecture info + self.gpu_arch_info = self._get_gpu_arch_info() + + def _get_gpu_arch_info(self): + """Get GPU architecture information""" + if not torch.cuda.is_available(): + return { + "compute_capability": None, + "is_hopper": False, + "is_blackwell": False, + 
} + + cap = torch.cuda.get_device_capability() + compute_capability = f"{cap[0]}.{cap[1]}" + + return { + "compute_capability": compute_capability, + "is_hopper": cap[0] == 9, + "is_blackwell": cap[0] == 10, + } + + def _is_pytorch_grouped_mm_available(self): + """Check if PyTorch grouped_mm is available and supported on this architecture""" + if not hasattr(torch, "_grouped_mm"): + return False + + # Currently grouped_mm is only optimized for Hopper + if self.gpu_arch_info["is_blackwell"]: + return False + + return True + def create_test_data( self, batch_size: int, seq_len: int, hidden_size: int ) -> torch.Tensor: @@ -251,24 +305,46 @@ def benchmark_cutlass_strategy( contig_tokens, m_sizes, m_offsets, moe_layer ) - torch.cuda.synchronize() + # Use triton.do_bench if available + if self.timing_method == "triton": - # Benchmark - start_time = time.time() - - for _ in range(iterations): - contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens(hidden_states) - if sum(m_sizes) > 0: - expert_outputs = strategy.execute( - contig_tokens, m_sizes, m_offsets, moe_layer + def bench_fn(): + contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens( + hidden_states ) - else: - expert_outputs = torch.empty( - 0, moe_layer.hidden_size, dtype=self.dtype, device=self.device + if sum(m_sizes) > 0: + expert_outputs = strategy.execute( + contig_tokens, m_sizes, m_offsets, moe_layer + ) + else: + expert_outputs = torch.empty( + 0, moe_layer.hidden_size, dtype=self.dtype, device=self.device + ) + torch.cuda.synchronize() + + ms_times = triton.testing.do_bench(bench_fn, warmup=3, rep=iterations) + avg_time = ms_times + else: + # Fall back to CUDA events + torch.cuda.synchronize() + start_time = time.time() + + for _ in range(iterations): + contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens( + hidden_states ) + if sum(m_sizes) > 0: + expert_outputs = strategy.execute( + contig_tokens, m_sizes, m_offsets, moe_layer + ) + else: + expert_outputs = torch.empty( + 0, moe_layer.hidden_size, dtype=self.dtype, device=self.device + ) - torch.cuda.synchronize() - end_time = time.time() + torch.cuda.synchronize() + end_time = time.time() + avg_time = (end_time - start_time) / iterations * 1000 # ms # Final forward pass for output contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens(hidden_states) @@ -282,7 +358,6 @@ def benchmark_cutlass_strategy( ) final_output = moe_layer.reconstruct_output(expert_outputs) - avg_time = (end_time - start_time) / iterations * 1000 # ms return final_output, avg_time @@ -303,24 +378,46 @@ def benchmark_manual_strategy( contig_tokens, m_sizes, m_offsets, moe_layer ) - torch.cuda.synchronize() - - # Benchmark - start_time = time.time() + # Use triton.do_bench if available + if self.timing_method == "triton": - for _ in range(iterations): - contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens(hidden_states) - if sum(m_sizes) > 0: - expert_outputs = strategy.execute( - contig_tokens, m_sizes, m_offsets, moe_layer + def bench_fn(): + contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens( + hidden_states ) - else: - expert_outputs = torch.empty( - 0, moe_layer.hidden_size, dtype=self.dtype, device=self.device + if sum(m_sizes) > 0: + expert_outputs = strategy.execute( + contig_tokens, m_sizes, m_offsets, moe_layer + ) + else: + expert_outputs = torch.empty( + 0, moe_layer.hidden_size, dtype=self.dtype, device=self.device + ) + torch.cuda.synchronize() + + ms_times = triton.testing.do_bench(bench_fn, warmup=3, rep=iterations) + avg_time = ms_times + 
else: + # Fall back to CUDA events + torch.cuda.synchronize() + start_time = time.time() + + for _ in range(iterations): + contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens( + hidden_states ) + if sum(m_sizes) > 0: + expert_outputs = strategy.execute( + contig_tokens, m_sizes, m_offsets, moe_layer + ) + else: + expert_outputs = torch.empty( + 0, moe_layer.hidden_size, dtype=self.dtype, device=self.device + ) - torch.cuda.synchronize() - end_time = time.time() + torch.cuda.synchronize() + end_time = time.time() + avg_time = (end_time - start_time) / iterations * 1000 # ms # Final forward pass for output contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens(hidden_states) @@ -334,7 +431,6 @@ def benchmark_manual_strategy( ) final_output = moe_layer.reconstruct_output(expert_outputs) - avg_time = (end_time - start_time) / iterations * 1000 # ms return final_output, avg_time @@ -345,16 +441,9 @@ def benchmark_pytorch_grouped_mm( iterations: int = 10, ) -> Tuple[torch.Tensor, float]: """Benchmark PyTorch _grouped_mm if available""" - # Check if _grouped_mm exists and we're on a supported device if not hasattr(torch, "_grouped_mm"): return None, float("inf") - # Check if we're on Blackwell (compute capability 9.0) - device_props = torch.cuda.get_device_properties(hidden_states.device) - if device_props.major != 9: - print(" Skipping torch._grouped_mm: requires compute capability 9.0") - return None, float("inf") - # Warmup and benchmark similar to other methods for _ in range(3): contig_tokens, m_sizes, m_offsets, _ = moe_layer.route_tokens(hidden_states) @@ -399,20 +488,18 @@ def benchmark_pytorch_grouped_mm( def _pytorch_grouped_mm_forward( self, moe_layer: SimpleMoELayer, tokens: torch.Tensor, m_offsets: List[int] ) -> torch.Tensor: - # Convert m_offsets list to tensor - m_offsets_tensor = torch.tensor(m_offsets, device=tokens.device) - + """Simulate PyTorch grouped_mm forward pass""" # Gate and up projections gate_proj = torch._grouped_mm( tokens, moe_layer.gate_weights.transpose(-2, -1), - m_offsets_tensor, # Use tensor instead of list + m_offsets, out_dtype=self.dtype, ) up_proj = torch._grouped_mm( tokens, moe_layer.up_weights.transpose(-2, -1), - m_offsets_tensor, # Use tensor instead of list + m_offsets, out_dtype=self.dtype, ) @@ -423,7 +510,7 @@ def _pytorch_grouped_mm_forward( final_outputs = torch._grouped_mm( hidden_outputs, moe_layer.down_weights.transpose(-2, -1), - m_offsets_tensor, # Use tensor instead of list + m_offsets, out_dtype=self.dtype, ) @@ -520,6 +607,7 @@ def run_benchmark_suite(self): f" Routing: {total_tokens} tokens across {active_experts}/{config['num_experts']} experts" ) print(f" Expert loads: {m_sizes}") + print(f" Offsets (tensor): {m_offsets.tolist()}") results = {} @@ -529,7 +617,7 @@ def run_benchmark_suite(self): moe_layer, hidden_states ) results["manual"] = (manual_output, manual_time) - print(f" Time: {manual_time:.2f} ms") + print(f" Time: {manual_time:.3f} ms") # Benchmark CUTLASS strategy if HAS_CUTLASS: @@ -538,7 +626,7 @@ def run_benchmark_suite(self): moe_layer, hidden_states ) results["cutlass"] = (cutlass_output, cutlass_time) - print(f" Time: {cutlass_time:.2f} ms") + print(f" Time: {cutlass_time:.3f} ms") # Validate print(f"\n Validating CUTLASS vs Manual...") @@ -546,49 +634,123 @@ def run_benchmark_suite(self): print(f" ✓ Validation passed") speedup = manual_time / cutlass_time print(f" Speedup: {speedup:.2f}x") + + if speedup > 1.1: + print(f" 🚀 CUTLASS is faster!") + elif speedup < 0.9: + print(f" ⚠️ CUTLASS is slower 
- may indicate an issue") + else: + print(f" ≈ Performance is similar") else: print(f" ✗ Validation failed") all_passed = False - # Benchmark PyTorch grouped_mm - if hasattr(torch, "_grouped_mm"): + # Benchmark PyTorch grouped_mm (if available and supported) + if self._is_pytorch_grouped_mm_available(): print(f"\n Benchmarking PyTorch grouped_mm...") pytorch_output, pytorch_time = self.benchmark_pytorch_grouped_mm( moe_layer, hidden_states ) - results["pytorch"] = (pytorch_output, pytorch_time) - print(f" Time: {pytorch_time:.2f} ms") - if pytorch_output is not None: + results["pytorch"] = (pytorch_output, pytorch_time) + print(f" Time: {pytorch_time:.3f} ms") + print(f"\n Validating PyTorch vs Manual...") if self.validate_outputs(manual_output, pytorch_output): print(f" ✓ Validation passed") + speedup_pytorch = manual_time / pytorch_time + print(f" Speedup vs Manual: {speedup_pytorch:.2f}x") else: print(f" ✗ Validation failed") + else: + # Explain why PyTorch grouped_mm is not available + if not hasattr(torch, "_grouped_mm"): + print( + f"\n PyTorch grouped_mm not available (requires PyTorch 2.4+)" + ) + elif self.gpu_arch_info["is_blackwell"]: + print( + f"\n PyTorch grouped_mm disabled on Blackwell (Hopper-only currently)" + ) + else: + print( + f"\n PyTorch grouped_mm not available on this architecture" + ) - # Summary - print(f"\n Performance Summary:") - print(f" Manual Loop: {results['manual'][1]:.2f} ms") + # Performance summary with timing method info + print(f"\n Performance Summary ({self.timing_method} timing):") + print(f" Manual Loop: {results['manual'][1]:.3f} ms") if "cutlass" in results: - print(f" CUTLASS Grouped: {results['cutlass'][1]:.2f} ms") + speedup = results["manual"][1] / results["cutlass"][1] + print( + f" CUTLASS Grouped: {results['cutlass'][1]:.3f} ms ({speedup:.2f}x)" + ) if "pytorch" in results: - print(f" PyTorch grouped: {results['pytorch'][1]:.2f} ms") + speedup_pytorch = results["manual"][1] / results["pytorch"][1] + print( + f" PyTorch grouped: {results['pytorch'][1]:.3f} ms ({speedup_pytorch:.2f}x)" + ) + else: + print(f" PyTorch grouped: Not available") - # Calculate FLOPS + # Calculate FLOPS (more detailed) total_tokens = config["batch_size"] * config["seq_len"] - # Approximate FLOPs: 2 * (gate + up + down projections) + # More accurate FLOP counting: + # Gate: tokens * hidden * intermediate * 2 (FMA) + # Up: tokens * hidden * intermediate * 2 (FMA) + # Down: tokens * intermediate * hidden * 2 (FMA) flops_per_token = 2 * ( - config["hidden_size"] * config["intermediate_size"] * 2 # gate + up - + config["intermediate_size"] * config["hidden_size"] - ) # down - total_flops = total_tokens * flops_per_token + config["hidden_size"] * config["intermediate_size"] # gate + + config["hidden_size"] * config["intermediate_size"] # up + + config["intermediate_size"] * config["hidden_size"] # down + ) + total_flops = ( + total_tokens + * flops_per_token + * sum(1 for s in m_sizes if s > 0) + / config["num_experts"] + ) # Adjust for active experts + + print(f"\n FLOPS Analysis:") + print(f" Total FLOPs: {total_flops/1e9:.2f} GFLOP") manual_tflops = total_flops / (results["manual"][1] * 1e-3) / 1e12 - print(f" Manual TFLOPS: {manual_tflops:.2f}") + print(f" Manual TFLOPS: {manual_tflops:.3f}") if "cutlass" in results: cutlass_tflops = total_flops / (results["cutlass"][1] * 1e-3) / 1e12 - print(f" CUTLASS TFLOPS: {cutlass_tflops:.2f}") + efficiency = cutlass_tflops / manual_tflops * 100 + print( + f" CUTLASS TFLOPS: {cutlass_tflops:.3f} ({efficiency:.1f}% of 
manual)" + ) + + if "pytorch" in results: + pytorch_tflops = total_flops / (results["pytorch"][1] * 1e-3) / 1e12 + print(f" PyTorch TFLOPS: {pytorch_tflops:.3f}") + + # Memory bandwidth analysis + total_params = ( + config["num_experts"] + * config["hidden_size"] + * config["intermediate_size"] + * 2 # gate + up + + config["num_experts"] + * config["intermediate_size"] + * config["hidden_size"] # down + ) + param_size_gb = total_params * 2 / 1e9 # BF16 = 2 bytes + + print(f"\n Memory Analysis:") + print( + f" Total parameters: {total_params/1e6:.1f}M ({param_size_gb:.2f} GB)" + ) + + manual_bandwidth = param_size_gb / (results["manual"][1] * 1e-3) + print(f" Manual bandwidth: {manual_bandwidth:.1f} GB/s") + + if "cutlass" in results: + cutlass_bandwidth = param_size_gb / (results["cutlass"][1] * 1e-3) + print(f" CUTLASS bandwidth: {cutlass_bandwidth:.1f} GB/s") except Exception as e: print(f"✗ {config['name']} failed: {e}") @@ -616,11 +778,33 @@ def main(): """Run the MoE benchmark""" benchmark = MoEBenchmark() - print("MoE Performance Benchmark") + print("MoE Performance Benchmark with Accurate GPU Timing") print(f"Device: {benchmark.device}") print(f"Dtype: {benchmark.dtype}") + print(f"Timing Method: {benchmark.timing_method}") print(f"CUTLASS Available: {HAS_CUTLASS}") - print(f"PyTorch grouped_mm Available: {hasattr(torch, '_grouped_mm')}") + print( + f"PyTorch grouped_mm Available: {benchmark._is_pytorch_grouped_mm_available()}" + ) + + # Show architecture-specific information + if torch.cuda.is_available(): + arch_info = benchmark.gpu_arch_info + if arch_info["is_blackwell"]: + print( + f"🚫 PyTorch grouped_mm disabled on Blackwell (compute capability {arch_info['compute_capability']})" + ) + elif arch_info["is_hopper"]: + print( + f"✅ PyTorch grouped_mm available on Hopper (compute capability {arch_info['compute_capability']})" + ) + + if benchmark.timing_method == "triton": + print("🎯 Using Triton do_bench for most accurate GPU timing") + elif benchmark.timing_method == "cuda_events": + print("⏱️ Using CUDA events for accurate GPU timing") + else: + print("⚠️ Using CPU timing - results may be less accurate") success = benchmark.run_benchmark_suite() return 0 if success else 1 From a570007a2a9103dbb6fcbf9f7920a6cd07abaea6 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sun, 8 Jun 2025 21:23:57 -0700 Subject: [PATCH 05/34] add xlarge config to mimic deepseek (256 experts) --- torchtitan/experiments/deepseek_v3/test_moe.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/torchtitan/experiments/deepseek_v3/test_moe.py b/torchtitan/experiments/deepseek_v3/test_moe.py index 2b422fd31..5a3ed3dcd 100644 --- a/torchtitan/experiments/deepseek_v3/test_moe.py +++ b/torchtitan/experiments/deepseek_v3/test_moe.py @@ -567,6 +567,15 @@ def run_benchmark_suite(self): "num_experts": 32, "top_k": 2, }, + { + "name": "X-Large MoE", + "batch_size": 8, + "seq_len": 4096, + "hidden_size": 2048, + "intermediate_size": 4096, + "num_experts": 256, + "top_k": 6, + }, ] all_passed = True From 93db6c08cec76d11deef5405c63952706b438f4c Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Tue, 10 Jun 2025 11:43:14 -0700 Subject: [PATCH 06/34] add gg driver to test out configs --- torchtitan/experiments/kernels/blackwell/gg_driver.sh | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 torchtitan/experiments/kernels/blackwell/gg_driver.sh diff --git a/torchtitan/experiments/kernels/blackwell/gg_driver.sh b/torchtitan/experiments/kernels/blackwell/gg_driver.sh new file mode 100644 index 
000000000..4cf1352f1 --- /dev/null +++ b/torchtitan/experiments/kernels/blackwell/gg_driver.sh @@ -0,0 +1,6 @@ +python cute_grouped_gemm.py \ +--use_2cta_instrs \ +--ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \ +--mma_tiler_mn 128,128 --cluster_shape_mn 4,4 \ + --problem_sizes_mnkl "(8192,1280,32,1),(8,4096,1536,1),(640,1280,16,1),(640,512,16,1)" \ +--num_groups 4 --tensormap_update_mode SMEM From 86dae8be9de582fbce2cfe629a0c0e309f209967 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Wed, 11 Jun 2025 05:59:59 -0700 Subject: [PATCH 07/34] start on 2cta and larger cluster --- .../experiments/deepseek_v3/group_gemms.py | 563 ++++++++++++++++++ 1 file changed, 563 insertions(+) diff --git a/torchtitan/experiments/deepseek_v3/group_gemms.py b/torchtitan/experiments/deepseek_v3/group_gemms.py index ca93fffa6..f05d5fcab 100644 --- a/torchtitan/experiments/deepseek_v3/group_gemms.py +++ b/torchtitan/experiments/deepseek_v3/group_gemms.py @@ -68,6 +68,10 @@ print(f"✗ Import failed: {e}") print("Using PyTorch fallback implementations only") +import logging + +logger = logging.getLogger(__name__) + # Strategy base class for GroupGEMM implementations class GroupGEMMStrategy: @@ -184,6 +188,565 @@ def is_available() -> bool: class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): + """Fixed CUTLASS GroupedGemmKernel implementation""" + + def __init__(self, custom_activation): + super().__init__(custom_activation) + self.dtype_torch = torch.bfloat16 + self.dtype_cutlass = cutlass.BFloat16 + self.acc_dtype = cutlass.Float32 + self.mma_tiler_m = 128 # can only be 128 or 256 + self.mma_tiler_n = 64 # can only be 64 or 128 + + print("Initializing CUTLASS GroupedGemmKernel") + + # Kernel configuration + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.acc_dtype, + use_2cta_instrs=False, + mma_tiler_mn=(self.mma_tiler_m, self.mma_tiler_n), + cluster_shape_mn=(1, 1), + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + # Hardware info + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters(1) + + # Buffers + self._tensormap_buffers = {} + self._compiled_kernels = {} + + def arrange_expert_weights(self, all_weights, submod_name, module): + """Prepare expert weights for CUTLASS grouped GEMM""" + # Stack weights: [num_experts, out_dim, in_dim] + combined_weights = torch.stack(all_weights) + print(f"CUTLASS arranged weights {submod_name}: {combined_weights.shape}") + return combined_weights + + def _create_tensor_metadata(self, tokens, w_gate, w_up, w_down, m_sizes, device): + """Create the metadata tensors required by CUTLASS GroupedGEMM""" + # Filter out empty groups and create contiguous data + valid_groups = [] + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + output_tensors = [] + + offset = 0 + for expert_idx, size in enumerate(m_sizes): + if size > 0: + # Get input tokens for this expert + expert_tokens = tokens[offset : offset + size].contiguous() + + # Get weights for this expert + gate_weight = w_gate[ + expert_idx + ].contiguous() # [intermediate_size, hidden_size] + up_weight = w_up[expert_idx].contiguous() + down_weight = w_down[ + expert_idx + ].contiguous() # [hidden_size, intermediate_size] + + # Create tensors for each GEMM operation in MoE forward pass + M = size # Number of tokens for this expert + K_in = expert_tokens.shape[1] # hidden_size + N_intermediate = gate_weight.shape[0] # intermediate_size + N_out = down_weight.shape[0] # hidden_size (output) + L = 1 # Batch dimension + + # Store group info 
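+                # The group_info dict below keeps the per-expert tensors and
+                # shapes for the later activation and down-projection steps;
+                # the gate and up projections are then appended as two separate
+                # GEMM problems (A = tokens [M, K_in], B = weight
+                # [N_intermediate, K_in], C = output [M, N_intermediate], L = 1)
+                # to the problem_sizes / strides_abc / ptrs_abc lists that are
+                # converted to CUTE tensors at the end of this method.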
+ group_info = { + "expert_idx": expert_idx, + "tokens": expert_tokens, + "gate_weight": gate_weight, + "up_weight": up_weight, + "down_weight": down_weight, + "M": M, + "K_in": K_in, + "N_intermediate": N_intermediate, + "N_out": N_out, + } + valid_groups.append(group_info) + + # For gate projection: A=[M,K], B=[N,K], C=[M,N] + # CUTLASS expects B in [N,K] format (already correct) + gate_A = expert_tokens # [M, K_in] + gate_B = gate_weight # [N_intermediate, K_in] + gate_C = torch.empty( + M, N_intermediate, dtype=self.dtype_torch, device=device + ) + + problem_sizes.append([M, N_intermediate, K_in, L]) + strides_abc.append( + [ + [gate_A.stride(0), gate_A.stride(1)], # A strides + [gate_B.stride(0), gate_B.stride(1)], # B strides + [gate_C.stride(0), gate_C.stride(1)], # C strides + ] + ) + ptrs_abc.append( + [gate_A.data_ptr(), gate_B.data_ptr(), gate_C.data_ptr()] + ) + output_tensors.append(gate_C) + + # For up projection: same dimensions as gate + up_A = expert_tokens # [M, K_in] + up_B = up_weight # [N_intermediate, K_in] + up_C = torch.empty( + M, N_intermediate, dtype=self.dtype_torch, device=device + ) + + problem_sizes.append([M, N_intermediate, K_in, L]) + strides_abc.append( + [ + [up_A.stride(0), up_A.stride(1)], + [up_B.stride(0), up_B.stride(1)], + [up_C.stride(0), up_C.stride(1)], + ] + ) + ptrs_abc.append([up_A.data_ptr(), up_B.data_ptr(), up_C.data_ptr()]) + output_tensors.append(up_C) + + offset += size + + if not valid_groups: + return None, None, None, None, None + + # Convert to tensors + problem_sizes_tensor = torch.tensor( + problem_sizes, dtype=torch.int32, device=device + ) + strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) + ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) + + # Convert to CUTE tensors + problem_sizes_cute = from_dlpack(problem_sizes_tensor, assumed_align=16) + strides_cute = from_dlpack(strides_tensor, assumed_align=16) + ptrs_cute = from_dlpack(ptrs_tensor, assumed_align=16) + + return problem_sizes_cute, strides_cute, ptrs_cute, valid_groups, output_tensors + + def _create_initial_tensors(self, tokens, weights, device): + """Create initial tensors for tensormap setup""" + # Use smallest problem size for initial setup + M, K = 128, tokens.shape[1] # TODO - this is hardcoded for now + N = weights.shape[1] + + A_init = torch.randn(M, K, dtype=self.dtype_torch, device=device) + B_init = torch.randn( + N, K, dtype=self.dtype_torch, device=device + ) # Note: N,K format + C_init = torch.zeros(M, N, dtype=self.dtype_torch, device=device) + + # Convert to MNKL format and mark dynamic + A_mnkl = A_init.unsqueeze(-1).contiguous() + B_mnkl = B_init.unsqueeze(-1).contiguous() + C_mnkl = C_init.unsqueeze(-1).contiguous() + + A_cute = from_dlpack(A_mnkl, assumed_align=16) + B_cute = from_dlpack(B_mnkl, assumed_align=16) + C_cute = from_dlpack(C_mnkl, assumed_align=16) + + # Set CUTLASS data types + A_cute.element_type = self.dtype_cutlass + B_cute.element_type = self.dtype_cutlass + C_cute.element_type = self.dtype_cutlass + + # Mark layouts as dynamic + A_cute = A_cute.mark_layout_dynamic(leading_dim=1) + B_cute = B_cute.mark_layout_dynamic(leading_dim=1) + C_cute = C_cute.mark_layout_dynamic(leading_dim=1) + + return A_cute, B_cute, C_cute + + def _setup_tensormap_buffer(self, num_groups, device): + """Setup tensormap buffer for CUTLASS""" + cache_key = (num_groups, device) + + if cache_key not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + + tensormap_tensor = 
torch.zeros( + (sm_count, 3, 128 // 8), # 3 tensormaps (A, B, C), 128 bytes each + dtype=torch.int64, + device=device, + ) + self._tensormap_buffers[cache_key] = from_dlpack( + tensormap_tensor, assumed_align=16 + ) + + return self._tensormap_buffers[cache_key] + + def _compute_total_clusters(self, problem_sizes, cluster_shape_mn=(1, 1)): + """Compute total number of clusters needed""" + cluster_tile_m = self.mma_tiler_m # mma_tiler_mn[0] + cluster_tile_n = self.mma_tiler_n # mma_tiler_mn[1] + + total = 0 + for m, n, k, l in problem_sizes: + clusters_m = (m + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (n + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + + return total + + def execute(self, contig_tokens, m_sizes, m_offsets, module): + """Execute the complete MoE forward pass using CUTLASS grouped GEMM""" + try: + # Get weights + w_gate = module.get_parameter("gate_proj_weight") + w_up = module.get_parameter("up_proj_weight") + w_down = module.get_parameter("down_proj_weight") + + device = contig_tokens.device + num_valid_experts = len([s for s in m_sizes if s > 0]) + + if num_valid_experts == 0: + return torch.zeros_like(contig_tokens) + + logger.info(f"CUTLASS executing with {num_valid_experts} experts") + + # 1: Create metadata for gate and up projections (in theory, this can be batched) + gate_up_metadata = self._create_tensor_metadata( + contig_tokens, w_gate, w_up, w_down, m_sizes, device + ) + + if gate_up_metadata[0] is None: + return torch.zeros_like(contig_tokens) + + ( + problem_sizes_cute, + strides_cute, + ptrs_cute, + valid_groups, + gate_up_outputs, + ) = gate_up_metadata + + # 2: Create initial tensors for tensormap setup + first_group = valid_groups[0] + initial_A, initial_B, initial_C = self._create_initial_tensors_from_group( + first_group, device + ) + + # 3: Setup tensormap buffer + num_operations = len(gate_up_outputs) # gate + up operations + tensormap_cute = self._setup_tensormap_buffer(num_operations, device) + + # 4: Compute total clusters and setup kernel + total_clusters = self._compute_total_clusters_from_metadata( + problem_sizes_cute + ) + + # 5: Execute gate and up projections + gate_up_results = self._execute_cutlass_grouped_gemm( + initial_A, + initial_B, + initial_C, + num_operations, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + ) + + if gate_up_results is None: + logger.warning( + "CUTLASS kernel execution failed, falling back to manual" + ) + return self._manual_fallback_full( + contig_tokens, m_sizes, w_gate, w_up, w_down + ) + + # 6: Apply activation and combine gate/up results + intermediate_results = self._apply_activation_and_combine( + gate_up_outputs, valid_groups + ) + + # 7: Execute down projections + final_output = self._execute_down_projections( + intermediate_results, valid_groups, device + ) + + # 8: Reconstruct full output tensor + return self._reconstruct_output(final_output, m_sizes, contig_tokens) + + except Exception as e: + print(f"CUTLASS execution failed: {e}") + assert ( + False + ), "CUTLASS execution failed...could fall back to manual here but lets review first" + # Fall back to manual implementation + return self._manual_fallback_full( + contig_tokens, m_sizes, w_gate, w_up, w_down + ) + + def _create_initial_tensors_from_group(self, group_info, device): + """Create initial CUTE tensors from group information""" + M, K_in, N_intermediate = ( + group_info["M"], + group_info["K_in"], + group_info["N_intermediate"], + ) + + # Create initial tensors with 
proper dimensions + A_init = torch.randn(M, K_in, dtype=self.dtype_torch, device=device) + B_init = torch.randn( + N_intermediate, K_in, dtype=self.dtype_torch, device=device + ) + C_init = torch.zeros(M, N_intermediate, dtype=self.dtype_torch, device=device) + + # Convert to MNKL format and mark dynamic + A_mnkl = A_init.unsqueeze(-1).contiguous() + B_mnkl = B_init.unsqueeze(-1).contiguous() + C_mnkl = C_init.unsqueeze(-1).contiguous() + + A_cute = from_dlpack(A_mnkl, assumed_align=16) + B_cute = from_dlpack(B_mnkl, assumed_align=16) + C_cute = from_dlpack(C_mnkl, assumed_align=16) + + # Set CUTLASS data types + A_cute.element_type = self.dtype_cutlass + B_cute.element_type = self.dtype_cutlass + C_cute.element_type = self.dtype_cutlass + + # Mark layouts as dynamic + A_cute = A_cute.mark_layout_dynamic(leading_dim=1) + B_cute = B_cute.mark_layout_dynamic(leading_dim=1) + C_cute = C_cute.mark_layout_dynamic(leading_dim=1) + + return A_cute, B_cute, C_cute + + def _compute_total_clusters_from_metadata(self, problem_sizes_cute): + """Compute total clusters from problem sizes metadata""" + # Convert CUTE tensor back to Python list for computation + problem_sizes_data = ( + problem_sizes_cute.data + ) # TODO - how to extract this directly from CUTE tensor + problem_sizes_torch = torch.tensor( + problem_sizes_data, dtype=torch.int32, device=device + ) + + problem_sizes_torch = problem_sizes_cute.to_torch_tensor() + total = 0 + + cluster_tile_m = 128 # From mma_tiler_mn[0] + cluster_tile_n = 64 # From mma_tiler_mn[1] + + for i in range(problem_sizes_torch.shape[0]): + m, n, k, l = problem_sizes_torch[i].tolist() + clusters_m = (m + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (n + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + + return total + + def _execute_cutlass_grouped_gemm( + self, + initial_A, + initial_B, + initial_C, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + ): + """Execute the CUTLASS grouped GEMM kernel""" + try: + # Setup CUDA stream + torch_stream = torch.cuda.current_stream() + stream = cuda.CUstream(torch_stream.cuda_stream) + + # Compile kernel if not already cached + cache_key = (num_groups, total_clusters) + if cache_key not in self._compiled_kernels: + logger.info( + f"Compiling CUTLASS kernel for {num_groups} groups, {total_clusters} clusters" + ) + + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + initial_A, + initial_B, + initial_C, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + stream, + ) + + logger.info("CUTLASS kernel compilation successful") + + compiled_kernel = self._compiled_kernels[cache_key] + + # Execute kernel + logger.info(f"Executing CUTLASS grouped GEMM kernel") + compiled_kernel( + initial_A, + initial_B, + initial_C, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + stream, + ) + + # Synchronize to ensure completion + torch.cuda.synchronize() + logger.info("CUTLASS kernel execution completed") + + return True + + except Exception as e: + logger.error(f"CUTLASS kernel execution failed: {e}") + return None + + def _apply_activation_and_combine(self, gate_up_outputs, valid_groups): + """Apply activation function and combine gate/up projection results""" + intermediate_results = [] + + # gate_up_outputs contains interleaved gate and up results + for i in range(0, len(gate_up_outputs), 2): + gate_output = gate_up_outputs[i] + up_output = 
gate_up_outputs[i + 1] + + # Apply activation to gate output and multiply with up output + activated_gate = self.activation_function(gate_output) + combined = activated_gate * up_output + + intermediate_results.append(combined) + + return intermediate_results + + def _execute_down_projection(self, hidden_states, w_down, m_sizes, device): + """Execute down projection using grouped GEMM""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + down_outputs = [] + + expert_idx = 0 + for size in m_sizes: + if size > 0 and expert_idx < len(hidden_states): + hidden = hidden_states[expert_idx] + down_weight = w_down[ + expert_idx + ].contiguous() # [hidden_size, intermediate_size] + + M, K = hidden.shape + N = down_weight.shape[0] # hidden_size + L = 1 + + # Create output tensor + down_output = torch.empty(M, N, dtype=self.dtype_torch, device=device) + + # Convert to MNKL format (following bench_group_gemm.py) + hidden_mnkl = hidden.unsqueeze(-1).contiguous() # (M, K, 1) + down_weight_mnkl = down_weight.unsqueeze(-1).contiguous() # (N, K, 1) + down_output_mnkl = down_output.unsqueeze(-1).contiguous() # (M, N, 1) + + # Extract 2D strides from MNKL tensors (following bench_group_gemm.py) + hidden_strides = hidden_mnkl.stride()[:2] + down_weight_strides = down_weight_mnkl.stride()[:2] + down_output_strides = down_output_mnkl.stride()[:2] + + # Down projection metadata + problem_sizes.append([M, N, K, L]) + strides_abc.append( + [ + list(hidden_strides), # A strides + list(down_weight_strides), # B strides + list(down_output_strides), # C strides + ] + ) + ptrs_abc.append( + [hidden.data_ptr(), down_weight.data_ptr(), down_output.data_ptr()] + ) + + down_outputs.append(down_output) + expert_idx += 1 + + if not problem_sizes: + return [] + + # Execute grouped GEMM for down projection + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return down_outputs + + def _reconstruct_output(self, final_outputs, m_sizes, contig_tokens): + """Reconstruct the full output tensor from expert results""" + total_tokens = sum(m_sizes) + hidden_size = ( + final_outputs[0].shape[1] if final_outputs else contig_tokens.shape[1] + ) + + output = torch.zeros( + total_tokens, + hidden_size, + dtype=contig_tokens.dtype, + device=contig_tokens.device, + ) + + output_idx = 0 + result_idx = 0 + + for size in m_sizes: + if size > 0: + if result_idx < len(final_outputs): + output[output_idx : output_idx + size] = final_outputs[result_idx] + result_idx += 1 + output_idx += size + + return output + + def _manual_fallback_full(self, tokens, m_sizes, w_gate, w_up, w_down): + """Complete manual fallback implementation""" + total_tokens = sum(m_sizes) + hidden_size = w_gate.shape[2] if len(w_gate.shape) > 2 else w_gate.shape[1] + + output = torch.zeros( + total_tokens, hidden_size, dtype=tokens.dtype, device=tokens.device + ) + + offset = 0 + expert_idx = 0 + + for size in m_sizes: + if size > 0: + if expert_idx < w_gate.shape[0]: # Check bounds + expert_tokens = tokens[offset : offset + size] + + # Forward pass through expert + gate_out = torch.mm(expert_tokens, w_gate[expert_idx].t()) + up_out = torch.mm(expert_tokens, w_up[expert_idx].t()) + hidden = self.activation_function(gate_out) * up_out + expert_output = torch.mm(hidden, w_down[expert_idx].t()) + + output[offset : offset + size] = expert_output + expert_idx += 1 + + offset += size + + return output + + @staticmethod + def is_available() -> bool: + return CUTLASS_AVAILABLE + + +# ========================= end of CUTLASSGroupedGemmStrategy 
========================= + + +class CUTLASSGroupedGemmStrategy_prev(GroupGEMMStrategy): """ Strategy using CUTLASS GroupedGemmKernel for group GEMM operations From 8523b72b53a1f0bb2c2ea2811f6ea669affb2f59 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Wed, 11 Jun 2025 14:12:33 -0700 Subject: [PATCH 08/34] refactor cutlass group gemm for 2UMMA support and streamlined code --- .../experiments/deepseek_v3/group_gemms.py | 1197 ++++++----------- .../llama3/train_configs/debug_model.toml | 4 +- 2 files changed, 415 insertions(+), 786 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/group_gemms.py b/torchtitan/experiments/deepseek_v3/group_gemms.py index f05d5fcab..3226964a7 100644 --- a/torchtitan/experiments/deepseek_v3/group_gemms.py +++ b/torchtitan/experiments/deepseek_v3/group_gemms.py @@ -188,827 +188,477 @@ def is_available() -> bool: class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): - """Fixed CUTLASS GroupedGemmKernel implementation""" - - def __init__(self, custom_activation): - super().__init__(custom_activation) - self.dtype_torch = torch.bfloat16 - self.dtype_cutlass = cutlass.BFloat16 - self.acc_dtype = cutlass.Float32 - self.mma_tiler_m = 128 # can only be 128 or 256 - self.mma_tiler_n = 64 # can only be 64 or 128 - - print("Initializing CUTLASS GroupedGemmKernel") - - # Kernel configuration - self.grouped_gemm = GroupedGemmKernel( - acc_dtype=self.acc_dtype, - use_2cta_instrs=False, - mma_tiler_mn=(self.mma_tiler_m, self.mma_tiler_n), - cluster_shape_mn=(1, 1), - tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, - ) - - # Hardware info - self.hardware_info = utils.HardwareInfo() - self.max_active_clusters = self.hardware_info.get_max_active_clusters(1) - - # Buffers - self._tensormap_buffers = {} - self._compiled_kernels = {} - - def arrange_expert_weights(self, all_weights, submod_name, module): - """Prepare expert weights for CUTLASS grouped GEMM""" - # Stack weights: [num_experts, out_dim, in_dim] - combined_weights = torch.stack(all_weights) - print(f"CUTLASS arranged weights {submod_name}: {combined_weights.shape}") - return combined_weights - - def _create_tensor_metadata(self, tokens, w_gate, w_up, w_down, m_sizes, device): - """Create the metadata tensors required by CUTLASS GroupedGEMM""" - # Filter out empty groups and create contiguous data - valid_groups = [] - problem_sizes = [] - strides_abc = [] - ptrs_abc = [] - output_tensors = [] - - offset = 0 - for expert_idx, size in enumerate(m_sizes): - if size > 0: - # Get input tokens for this expert - expert_tokens = tokens[offset : offset + size].contiguous() - - # Get weights for this expert - gate_weight = w_gate[ - expert_idx - ].contiguous() # [intermediate_size, hidden_size] - up_weight = w_up[expert_idx].contiguous() - down_weight = w_down[ - expert_idx - ].contiguous() # [hidden_size, intermediate_size] - - # Create tensors for each GEMM operation in MoE forward pass - M = size # Number of tokens for this expert - K_in = expert_tokens.shape[1] # hidden_size - N_intermediate = gate_weight.shape[0] # intermediate_size - N_out = down_weight.shape[0] # hidden_size (output) - L = 1 # Batch dimension - - # Store group info - group_info = { - "expert_idx": expert_idx, - "tokens": expert_tokens, - "gate_weight": gate_weight, - "up_weight": up_weight, - "down_weight": down_weight, - "M": M, - "K_in": K_in, - "N_intermediate": N_intermediate, - "N_out": N_out, - } - valid_groups.append(group_info) - - # For gate projection: A=[M,K], B=[N,K], C=[M,N] - # CUTLASS expects B in [N,K] format (already 
correct) - gate_A = expert_tokens # [M, K_in] - gate_B = gate_weight # [N_intermediate, K_in] - gate_C = torch.empty( - M, N_intermediate, dtype=self.dtype_torch, device=device - ) - - problem_sizes.append([M, N_intermediate, K_in, L]) - strides_abc.append( - [ - [gate_A.stride(0), gate_A.stride(1)], # A strides - [gate_B.stride(0), gate_B.stride(1)], # B strides - [gate_C.stride(0), gate_C.stride(1)], # C strides - ] - ) - ptrs_abc.append( - [gate_A.data_ptr(), gate_B.data_ptr(), gate_C.data_ptr()] - ) - output_tensors.append(gate_C) - - # For up projection: same dimensions as gate - up_A = expert_tokens # [M, K_in] - up_B = up_weight # [N_intermediate, K_in] - up_C = torch.empty( - M, N_intermediate, dtype=self.dtype_torch, device=device - ) - - problem_sizes.append([M, N_intermediate, K_in, L]) - strides_abc.append( - [ - [up_A.stride(0), up_A.stride(1)], - [up_B.stride(0), up_B.stride(1)], - [up_C.stride(0), up_C.stride(1)], - ] - ) - ptrs_abc.append([up_A.data_ptr(), up_B.data_ptr(), up_C.data_ptr()]) - output_tensors.append(up_C) - - offset += size - - if not valid_groups: - return None, None, None, None, None - - # Convert to tensors - problem_sizes_tensor = torch.tensor( - problem_sizes, dtype=torch.int32, device=device - ) - strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) - ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) - - # Convert to CUTE tensors - problem_sizes_cute = from_dlpack(problem_sizes_tensor, assumed_align=16) - strides_cute = from_dlpack(strides_tensor, assumed_align=16) - ptrs_cute = from_dlpack(ptrs_tensor, assumed_align=16) - - return problem_sizes_cute, strides_cute, ptrs_cute, valid_groups, output_tensors - - def _create_initial_tensors(self, tokens, weights, device): - """Create initial tensors for tensormap setup""" - # Use smallest problem size for initial setup - M, K = 128, tokens.shape[1] # TODO - this is hardcoded for now - N = weights.shape[1] - - A_init = torch.randn(M, K, dtype=self.dtype_torch, device=device) - B_init = torch.randn( - N, K, dtype=self.dtype_torch, device=device - ) # Note: N,K format - C_init = torch.zeros(M, N, dtype=self.dtype_torch, device=device) - - # Convert to MNKL format and mark dynamic - A_mnkl = A_init.unsqueeze(-1).contiguous() - B_mnkl = B_init.unsqueeze(-1).contiguous() - C_mnkl = C_init.unsqueeze(-1).contiguous() - - A_cute = from_dlpack(A_mnkl, assumed_align=16) - B_cute = from_dlpack(B_mnkl, assumed_align=16) - C_cute = from_dlpack(C_mnkl, assumed_align=16) - - # Set CUTLASS data types - A_cute.element_type = self.dtype_cutlass - B_cute.element_type = self.dtype_cutlass - C_cute.element_type = self.dtype_cutlass - - # Mark layouts as dynamic - A_cute = A_cute.mark_layout_dynamic(leading_dim=1) - B_cute = B_cute.mark_layout_dynamic(leading_dim=1) - C_cute = C_cute.mark_layout_dynamic(leading_dim=1) - - return A_cute, B_cute, C_cute - - def _setup_tensormap_buffer(self, num_groups, device): - """Setup tensormap buffer for CUTLASS""" - cache_key = (num_groups, device) - - if cache_key not in self._tensormap_buffers: - sm_count = self.hardware_info.get_max_active_clusters(1) - - tensormap_tensor = torch.zeros( - (sm_count, 3, 128 // 8), # 3 tensormaps (A, B, C), 128 bytes each - dtype=torch.int64, - device=device, - ) - self._tensormap_buffers[cache_key] = from_dlpack( - tensormap_tensor, assumed_align=16 - ) - - return self._tensormap_buffers[cache_key] - - def _compute_total_clusters(self, problem_sizes, cluster_shape_mn=(1, 1)): - """Compute total number of clusters 
needed""" - cluster_tile_m = self.mma_tiler_m # mma_tiler_mn[0] - cluster_tile_n = self.mma_tiler_n # mma_tiler_mn[1] - - total = 0 - for m, n, k, l in problem_sizes: - clusters_m = (m + cluster_tile_m - 1) // cluster_tile_m - clusters_n = (n + cluster_tile_n - 1) // cluster_tile_n - total += clusters_m * clusters_n - - return total - - def execute(self, contig_tokens, m_sizes, m_offsets, module): - """Execute the complete MoE forward pass using CUTLASS grouped GEMM""" - try: - # Get weights - w_gate = module.get_parameter("gate_proj_weight") - w_up = module.get_parameter("up_proj_weight") - w_down = module.get_parameter("down_proj_weight") - - device = contig_tokens.device - num_valid_experts = len([s for s in m_sizes if s > 0]) - - if num_valid_experts == 0: - return torch.zeros_like(contig_tokens) - - logger.info(f"CUTLASS executing with {num_valid_experts} experts") - - # 1: Create metadata for gate and up projections (in theory, this can be batched) - gate_up_metadata = self._create_tensor_metadata( - contig_tokens, w_gate, w_up, w_down, m_sizes, device - ) - - if gate_up_metadata[0] is None: - return torch.zeros_like(contig_tokens) - - ( - problem_sizes_cute, - strides_cute, - ptrs_cute, - valid_groups, - gate_up_outputs, - ) = gate_up_metadata - - # 2: Create initial tensors for tensormap setup - first_group = valid_groups[0] - initial_A, initial_B, initial_C = self._create_initial_tensors_from_group( - first_group, device - ) - - # 3: Setup tensormap buffer - num_operations = len(gate_up_outputs) # gate + up operations - tensormap_cute = self._setup_tensormap_buffer(num_operations, device) - - # 4: Compute total clusters and setup kernel - total_clusters = self._compute_total_clusters_from_metadata( - problem_sizes_cute - ) - - # 5: Execute gate and up projections - gate_up_results = self._execute_cutlass_grouped_gemm( - initial_A, - initial_B, - initial_C, - num_operations, - problem_sizes_cute, - strides_cute, - ptrs_cute, - total_clusters, - tensormap_cute, - ) - - if gate_up_results is None: - logger.warning( - "CUTLASS kernel execution failed, falling back to manual" - ) - return self._manual_fallback_full( - contig_tokens, m_sizes, w_gate, w_up, w_down - ) - - # 6: Apply activation and combine gate/up results - intermediate_results = self._apply_activation_and_combine( - gate_up_outputs, valid_groups - ) - - # 7: Execute down projections - final_output = self._execute_down_projections( - intermediate_results, valid_groups, device - ) - - # 8: Reconstruct full output tensor - return self._reconstruct_output(final_output, m_sizes, contig_tokens) - - except Exception as e: - print(f"CUTLASS execution failed: {e}") - assert ( - False - ), "CUTLASS execution failed...could fall back to manual here but lets review first" - # Fall back to manual implementation - return self._manual_fallback_full( - contig_tokens, m_sizes, w_gate, w_up, w_down - ) + """ + Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. - def _create_initial_tensors_from_group(self, group_info, device): - """Create initial CUTE tensors from group information""" - M, K_in, N_intermediate = ( - group_info["M"], - group_info["K_in"], - group_info["N_intermediate"], - ) + Supports both single and dual CTA instruction modes with flexible cluster configurations. 
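+
+    Defaults when mma_tiler_mn / cluster_shape_mn are not given: (128, 128) with
+    cluster (1, 1) in single-CTA mode, or (256, 128) with cluster (2, 2) when
+    use_2cta_instrs=True.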
- # Create initial tensors with proper dimensions - A_init = torch.randn(M, K_in, dtype=self.dtype_torch, device=device) - B_init = torch.randn( - N_intermediate, K_in, dtype=self.dtype_torch, device=device - ) - C_init = torch.zeros(M, N_intermediate, dtype=self.dtype_torch, device=device) - - # Convert to MNKL format and mark dynamic - A_mnkl = A_init.unsqueeze(-1).contiguous() - B_mnkl = B_init.unsqueeze(-1).contiguous() - C_mnkl = C_init.unsqueeze(-1).contiguous() - - A_cute = from_dlpack(A_mnkl, assumed_align=16) - B_cute = from_dlpack(B_mnkl, assumed_align=16) - C_cute = from_dlpack(C_mnkl, assumed_align=16) - - # Set CUTLASS data types - A_cute.element_type = self.dtype_cutlass - B_cute.element_type = self.dtype_cutlass - C_cute.element_type = self.dtype_cutlass - - # Mark layouts as dynamic - A_cute = A_cute.mark_layout_dynamic(leading_dim=1) - B_cute = B_cute.mark_layout_dynamic(leading_dim=1) - C_cute = C_cute.mark_layout_dynamic(leading_dim=1) - - return A_cute, B_cute, C_cute - - def _compute_total_clusters_from_metadata(self, problem_sizes_cute): - """Compute total clusters from problem sizes metadata""" - # Convert CUTE tensor back to Python list for computation - problem_sizes_data = ( - problem_sizes_cute.data - ) # TODO - how to extract this directly from CUTE tensor - problem_sizes_torch = torch.tensor( - problem_sizes_data, dtype=torch.int32, device=device - ) + Supported cluster shapes: (1,1), (1,2), (1,4), (2,1), (2,2), (2,4), (4,1), (4,2), (4,4) - problem_sizes_torch = problem_sizes_cute.to_torch_tensor() - total = 0 + Usage Examples: - cluster_tile_m = 128 # From mma_tiler_mn[0] - cluster_tile_n = 64 # From mma_tiler_mn[1] + # Basic single CTA mode + strategy = CUTLASSGroupedGemmStrategy(custom_activation) - for i in range(problem_sizes_torch.shape[0]): - m, n, k, l = problem_sizes_torch[i].tolist() - clusters_m = (m + cluster_tile_m - 1) // cluster_tile_m - clusters_n = (n + cluster_tile_n - 1) // cluster_tile_n - total += clusters_m * clusters_n + # 2 CTA mode with default settings + strategy = CUTLASSGroupedGemmStrategy(custom_activation, use_2cta_instrs=True) - return total + # High-performance configuration with large cluster + strategy = CUTLASSGroupedGemmStrategy( + custom_activation, + use_2cta_instrs=True, + mma_tiler_mn=(256, 128), + cluster_shape_mn=(4, 4) + ) + """ - def _execute_cutlass_grouped_gemm( + # Constants + SUPPORTED_CLUSTER_SHAPES = [ + (1, 1), + (1, 2), + (1, 4), + (2, 1), + (2, 2), + (2, 4), + (4, 1), + (4, 2), + (4, 4), + ] + + SINGLE_CTA_M_SIZES = [64, 128] + DUAL_CTA_M_SIZES = [128, 256] + N_SIZE_RANGE = range(32, 257, 32) + + DTYPE_TORCH = torch.bfloat16 + DTYPE_CUTLASS = cutlass.BFloat16 + ACC_DTYPE = cutlass.Float32 + ALIGNMENT = 16 + TENSORMAP_COUNT = 3 + TENSORMAP_BYTES = 128 + + def __init__( self, - initial_A, - initial_B, - initial_C, - num_groups, - problem_sizes_cute, - strides_cute, - ptrs_cute, - total_clusters, - tensormap_cute, + custom_activation, + use_2cta_instrs=False, + mma_tiler_mn=None, + cluster_shape_mn=None, ): - """Execute the CUTLASS grouped GEMM kernel""" - try: - # Setup CUDA stream - torch_stream = torch.cuda.current_stream() - stream = cuda.CUstream(torch_stream.cuda_stream) - - # Compile kernel if not already cached - cache_key = (num_groups, total_clusters) - if cache_key not in self._compiled_kernels: - logger.info( - f"Compiling CUTLASS kernel for {num_groups} groups, {total_clusters} clusters" - ) - - self._compiled_kernels[cache_key] = cute.compile( - self.grouped_gemm, - initial_A, - initial_B, - 
initial_C, - num_groups, - problem_sizes_cute, - strides_cute, - ptrs_cute, - total_clusters, - tensormap_cute, - self.max_active_clusters, - stream, - ) - - logger.info("CUTLASS kernel compilation successful") - - compiled_kernel = self._compiled_kernels[cache_key] - - # Execute kernel - logger.info(f"Executing CUTLASS grouped GEMM kernel") - compiled_kernel( - initial_A, - initial_B, - initial_C, - problem_sizes_cute, - strides_cute, - ptrs_cute, - tensormap_cute, - stream, - ) - - # Synchronize to ensure completion - torch.cuda.synchronize() - logger.info("CUTLASS kernel execution completed") - - return True - - except Exception as e: - logger.error(f"CUTLASS kernel execution failed: {e}") - return None - - def _apply_activation_and_combine(self, gate_up_outputs, valid_groups): - """Apply activation function and combine gate/up projection results""" - intermediate_results = [] - - # gate_up_outputs contains interleaved gate and up results - for i in range(0, len(gate_up_outputs), 2): - gate_output = gate_up_outputs[i] - up_output = gate_up_outputs[i + 1] + """ + Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture. - # Apply activation to gate output and multiply with up output - activated_gate = self.activation_function(gate_output) - combined = activated_gate * up_output + Args: + custom_activation: The activation function to use + use_2cta_instrs (bool): Whether to use 2 CTA instructions for enhanced performance + mma_tiler_mn (tuple, optional): MMA tile shape (M, N). If None, uses Blackwell-optimized defaults + cluster_shape_mn (tuple, optional): Cluster shape (M, N). If None, uses optimized defaults + """ + super().__init__(custom_activation) + self.use_2cta_instrs = use_2cta_instrs - intermediate_results.append(combined) + # Set configuration + self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() + self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() - return intermediate_results + # Validate configurations + self._validate_configurations() - def _execute_down_projection(self, hidden_states, w_down, m_sizes, device): - """Execute down projection using grouped GEMM""" - problem_sizes = [] - strides_abc = [] - ptrs_abc = [] - down_outputs = [] + # Initialize kernel and hardware info + self._initialize_kernel() + self._initialize_hardware() - expert_idx = 0 - for size in m_sizes: - if size > 0 and expert_idx < len(hidden_states): - hidden = hidden_states[expert_idx] - down_weight = w_down[ - expert_idx - ].contiguous() # [hidden_size, intermediate_size] - - M, K = hidden.shape - N = down_weight.shape[0] # hidden_size - L = 1 - - # Create output tensor - down_output = torch.empty(M, N, dtype=self.dtype_torch, device=device) - - # Convert to MNKL format (following bench_group_gemm.py) - hidden_mnkl = hidden.unsqueeze(-1).contiguous() # (M, K, 1) - down_weight_mnkl = down_weight.unsqueeze(-1).contiguous() # (N, K, 1) - down_output_mnkl = down_output.unsqueeze(-1).contiguous() # (M, N, 1) - - # Extract 2D strides from MNKL tensors (following bench_group_gemm.py) - hidden_strides = hidden_mnkl.stride()[:2] - down_weight_strides = down_weight_mnkl.stride()[:2] - down_output_strides = down_output_mnkl.stride()[:2] - - # Down projection metadata - problem_sizes.append([M, N, K, L]) - strides_abc.append( - [ - list(hidden_strides), # A strides - list(down_weight_strides), # B strides - list(down_output_strides), # C strides - ] - ) - ptrs_abc.append( - [hidden.data_ptr(), down_weight.data_ptr(), down_output.data_ptr()] - ) - - 
down_outputs.append(down_output) - expert_idx += 1 + # Initialize caches + self._compiled_kernels = {} + self._tensormap_buffers = {} - if not problem_sizes: - return [] + self._log_initialization() - # Execute grouped GEMM for down projection - self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + def _get_default_mma_tiler(self): + """Get default MMA tiler configuration based on CTA mode.""" + return (256, 128) if self.use_2cta_instrs else (128, 128) - return down_outputs + def _get_default_cluster_shape(self): + """Get default cluster shape based on CTA mode.""" + return (2, 2) if self.use_2cta_instrs else (1, 1) - def _reconstruct_output(self, final_outputs, m_sizes, contig_tokens): - """Reconstruct the full output tensor from expert results""" - total_tokens = sum(m_sizes) - hidden_size = ( - final_outputs[0].shape[1] if final_outputs else contig_tokens.shape[1] + def _initialize_kernel(self): + """Initialize the CUTLASS grouped GEMM kernel.""" + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.ACC_DTYPE, + use_2cta_instrs=self.use_2cta_instrs, + mma_tiler_mn=self.mma_tiler_mn, + cluster_shape_mn=self.cluster_shape_mn, + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, ) - output = torch.zeros( - total_tokens, - hidden_size, - dtype=contig_tokens.dtype, - device=contig_tokens.device, + def _initialize_hardware(self): + """Initialize hardware information and stream.""" + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters( + self.cluster_shape_mn[0] * self.cluster_shape_mn[1] ) - output_idx = 0 - result_idx = 0 - - for size in m_sizes: - if size > 0: - if result_idx < len(final_outputs): - output[output_idx : output_idx + size] = final_outputs[result_idx] - result_idx += 1 - output_idx += size + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) - return output + def _validate_configurations(self): + """Validate that the configurations are compatible with Blackwell and the selected mode.""" + self._validate_mma_tiler() + self._validate_cluster_shape() + self._validate_2cta_constraints() - def _manual_fallback_full(self, tokens, m_sizes, w_gate, w_up, w_down): - """Complete manual fallback implementation""" - total_tokens = sum(m_sizes) - hidden_size = w_gate.shape[2] if len(w_gate.shape) > 2 else w_gate.shape[1] + def _validate_mma_tiler(self): + """Validate MMA tiler configuration.""" + m_size, n_size = self.mma_tiler_mn - output = torch.zeros( - total_tokens, hidden_size, dtype=tokens.dtype, device=tokens.device + valid_m_sizes = ( + self.DUAL_CTA_M_SIZES if self.use_2cta_instrs else self.SINGLE_CTA_M_SIZES ) + mode_name = "2 CTA" if self.use_2cta_instrs else "single CTA" - offset = 0 - expert_idx = 0 - - for size in m_sizes: - if size > 0: - if expert_idx < w_gate.shape[0]: # Check bounds - expert_tokens = tokens[offset : offset + size] - - # Forward pass through expert - gate_out = torch.mm(expert_tokens, w_gate[expert_idx].t()) - up_out = torch.mm(expert_tokens, w_up[expert_idx].t()) - hidden = self.activation_function(gate_out) * up_out - expert_output = torch.mm(hidden, w_down[expert_idx].t()) - - output[offset : offset + size] = expert_output - expert_idx += 1 - - offset += size - - return output - - @staticmethod - def is_available() -> bool: - return CUTLASS_AVAILABLE - - -# ========================= end of CUTLASSGroupedGemmStrategy ========================= - - -class CUTLASSGroupedGemmStrategy_prev(GroupGEMMStrategy): - """ - 
Strategy using CUTLASS GroupedGemmKernel for group GEMM operations - - """ - - def __init__(self, custom_activation): - super().__init__(custom_activation) - self.dtype_torch = torch.bfloat16 - self.dtype_cutlass = cutlass.BFloat16 - self.acc_dtype = cutlass.Float32 - self.alignment = 16 - - # Create grouped GEMM kernel - self.grouped_gemm = GroupedGemmKernel( - acc_dtype=self.acc_dtype, - use_2cta_instrs=False, - mma_tiler_mn=(128, 64), - cluster_shape_mn=(1, 1), - tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, - ) + if m_size not in valid_m_sizes: + raise ValueError( + f"For {mode_name} mode on Blackwell, MMA tiler M must be in {valid_m_sizes}, got {m_size}" + ) - # Setup hardware info and stream - self.hardware_info = utils.HardwareInfo() - self.max_active_clusters = self.hardware_info.get_max_active_clusters(1) + if n_size not in self.N_SIZE_RANGE: + raise ValueError( + f"MMA tiler N must be in range [32, 256] with step 32, got {n_size}" + ) - torch_stream = torch.cuda.current_stream() - self.stream = cuda.CUstream(torch_stream.cuda_stream) + def _validate_cluster_shape(self): + """Validate cluster shape configuration.""" + if self.cluster_shape_mn not in self.SUPPORTED_CLUSTER_SHAPES: + raise ValueError( + f"Cluster shape {self.cluster_shape_mn} not supported on Blackwell. " + f"Valid cluster shapes are: {self.SUPPORTED_CLUSTER_SHAPES}" + ) - # Cache for compiled kernels and tensormap buffers - self._compiled_kernels = {} - self._tensormap_buffers = {} + def _validate_2cta_constraints(self): + """Validate 2 CTA specific constraints.""" + if self.use_2cta_instrs and self.cluster_shape_mn[0] % 2 != 0: + valid_2cta_shapes = [ + shape for shape in self.SUPPORTED_CLUSTER_SHAPES if shape[0] % 2 == 0 + ] + raise ValueError( + f"For 2 CTA mode, cluster shape M must be even, got {self.cluster_shape_mn[0]}. 
" + f"Valid 2 CTA cluster shapes: {valid_2cta_shapes}" + ) - print("Initialized CUTLASSGroupedGemmStrategy with GroupedGemmKernel") + def _log_initialization(self): + """Log initialization information.""" + cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + print(f"Initialized CUTLASSGroupedGemmStrategy for Blackwell with:") + print(f" - 2 CTA instructions: {self.use_2cta_instrs}") + print(f" - MMA tiler (M, N): {self.mma_tiler_mn}") + print(f" - Cluster shape (M, N): {self.cluster_shape_mn}") + print(f" - Cluster size: {cluster_size}") + if cluster_size > 1: + print(f" - Using multi-cluster parallelism for enhanced performance") def arrange_expert_weights(self, all_weights, submod_name, module): - """Store weights in stacked format""" + """Store weights in stacked format.""" return torch.stack(all_weights) def execute(self, contig_tokens, m_sizes, m_offsets, module): - """Execute using CUTLASS grouped GEMM kernel""" - # Get weights - w_gate = module.get_parameter("gate_proj_weight") - w_up = module.get_parameter("up_proj_weight") - w_down = module.get_parameter("down_proj_weight") + """Execute using CUTLASS grouped GEMM kernel.""" + # Validate inputs + self._validate_inputs(contig_tokens, m_sizes, module) + # Get weights + weights = self._get_weights(module) device = contig_tokens.device - hidden_size = w_gate.shape[2] # Prepare output tensor output = torch.zeros( - contig_tokens.shape[0], hidden_size, dtype=self.dtype_torch, device=device + contig_tokens.shape[0], + weights["gate"].shape[2], + dtype=self.DTYPE_TORCH, + device=device, ) - # Filter valid experts - valid_experts = [(i, size) for i, size in enumerate(m_sizes) if size > 0] - if not valid_experts: + # Check for valid experts + if not any(size > 0 for size in m_sizes): return output - # Step 1: Execute gate and up projections using grouped GEMM - gate_outputs, up_outputs = self._execute_gate_up_projections( - contig_tokens, w_gate, w_up, m_sizes, device + # Execute the three-stage computation + gate_outputs, up_outputs = self._execute_projections( + contig_tokens, weights["gate"], weights["up"], m_sizes, device ) - # Step 2: Apply activation and combine - hidden_states = self._apply_activation_and_combine( - gate_outputs, up_outputs, m_sizes - ) + hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) - # Step 3: Execute down projection using grouped GEMM - final_outputs = self._execute_down_projection( - hidden_states, w_down, m_sizes, device - ) + final_outputs = self._execute_projections( + hidden_states, weights["down"], None, m_sizes, device, is_down_proj=True + )[ + 0 + ] # Only return first element for down projection - # Step 4: Reconstruct full output return self._reconstruct_output(final_outputs, m_sizes, output) - def _execute_gate_up_projections( - self, contig_tokens, w_gate, w_up, m_sizes, device + def _validate_inputs(self, contig_tokens, m_sizes, module): + """Validate input parameters.""" + if contig_tokens.dtype != self.DTYPE_TORCH: + raise ValueError( + f"Expected input dtype {self.DTYPE_TORCH}, got {contig_tokens.dtype}" + ) + + if len(contig_tokens.shape) != 2: + raise ValueError( + f"Expected 2D input tensor, got shape {contig_tokens.shape}" + ) + + # required_params = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] + # for param in required_params: + # if not hasattr(module, param) or module.get_parameter(param) is None: + # raise ValueError(f"Module missing required parameter: {param}") + + def _get_weights(self, module): + """Extract and return weight 
tensors from module.""" + return { + "gate": module.get_parameter("gate_proj_weight"), + "up": module.get_parameter("up_proj_weight"), + "down": module.get_parameter("down_proj_weight"), + } + + def _execute_projections( + self, input_tokens, weight1, weight2, m_sizes, device, is_down_proj=False ): - """Execute gate and up projections using grouped GEMM""" - # Prepare tensors and metadata for gate and up projections + """Execute one or two projections using grouped GEMM.""" + # Prepare metadata for the projection(s) + problem_sizes, strides_abc, ptrs_abc, outputs = ( + self._prepare_projection_metadata( + input_tokens, weight1, weight2, m_sizes, device, is_down_proj + ) + ) + + if not problem_sizes: + return [], [] + + # Execute grouped GEMM + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return outputs if not is_down_proj else (outputs, []) + + def _prepare_projection_metadata( + self, input_tokens, weight1, weight2, m_sizes, device, is_down_proj + ): + """Prepare metadata for projection operations.""" problem_sizes = [] strides_abc = [] ptrs_abc = [] - gate_outputs = [] - up_outputs = [] + outputs = [] + + if is_down_proj: + # For down projection, input_tokens is actually hidden_states list + return self._prepare_down_projection_metadata( + input_tokens, weight1, m_sizes, device + ) offset = 0 for expert_idx, size in enumerate(m_sizes): if size > 0: - # Get expert tokens - expert_tokens = contig_tokens[offset : offset + size].contiguous() - gate_weight = w_gate[ - expert_idx - ].contiguous() # [intermediate_size, hidden_size] - up_weight = w_up[expert_idx].contiguous() - - M, K = expert_tokens.shape - N = gate_weight.shape[0] # intermediate_size - L = 1 - - # Create output tensors for gate and up projections - gate_output = torch.empty(M, N, dtype=self.dtype_torch, device=device) - up_output = torch.empty(M, N, dtype=self.dtype_torch, device=device) - - # Convert to MNKL format (following bench_group_gemm.py) - expert_tokens_mnkl = expert_tokens.unsqueeze( - -1 - ).contiguous() # (M, K, 1) - gate_weight_mnkl = gate_weight.unsqueeze(-1).contiguous() # (N, K, 1) - up_weight_mnkl = up_weight.unsqueeze(-1).contiguous() # (N, K, 1) - gate_output_mnkl = gate_output.unsqueeze(-1).contiguous() # (M, N, 1) - up_output_mnkl = up_output.unsqueeze(-1).contiguous() # (M, N, 1) - - # Extract 2D strides from MNKL tensors (following bench_group_gemm.py) - expert_tokens_strides = expert_tokens_mnkl.stride()[:2] - gate_weight_strides = gate_weight_mnkl.stride()[:2] - up_weight_strides = up_weight_mnkl.stride()[:2] - gate_output_strides = gate_output_mnkl.stride()[:2] - up_output_strides = up_output_mnkl.stride()[:2] - - # Gate projection metadata - problem_sizes.append([M, N, K, L]) - strides_abc.append( - [ - list(expert_tokens_strides), # A strides - list(gate_weight_strides), # B strides - list(gate_output_strides), # C strides - ] - ) - ptrs_abc.append( - [ - expert_tokens.data_ptr(), - gate_weight.data_ptr(), - gate_output.data_ptr(), - ] + # Get expert data + expert_tokens = input_tokens[offset : offset + size].contiguous() + weight1_expert = weight1[expert_idx].contiguous() + weight2_expert = ( + weight2[expert_idx].contiguous() if weight2 is not None else None ) - # Up projection metadata - problem_sizes.append([M, N, K, L]) - strides_abc.append( - [ - list(expert_tokens_strides), # A strides - list(up_weight_strides), # B strides - list(up_output_strides), # C strides - ] + # Create outputs and add to metadata + output1 = self._create_output_tensor( + 
expert_tokens.shape[0], weight1_expert.shape[0], device ) - ptrs_abc.append( - [ - expert_tokens.data_ptr(), - up_weight.data_ptr(), - up_output.data_ptr(), - ] + outputs.append(output1) + + self._add_projection_to_metadata( + expert_tokens, + weight1_expert, + output1, + problem_sizes, + strides_abc, + ptrs_abc, ) - gate_outputs.append(gate_output) - up_outputs.append(up_output) + if weight2_expert is not None: + output2 = self._create_output_tensor( + expert_tokens.shape[0], weight2_expert.shape[0], device + ) + outputs.append(output2) + + self._add_projection_to_metadata( + expert_tokens, + weight2_expert, + output2, + problem_sizes, + strides_abc, + ptrs_abc, + ) offset += size - if not problem_sizes: - return [], [] - - # Execute grouped GEMM for gate and up projections - self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) - - return gate_outputs, up_outputs + return ( + problem_sizes, + strides_abc, + ptrs_abc, + self._split_gate_up_outputs(outputs), + ) - def _execute_down_projection(self, hidden_states, w_down, m_sizes, device): - """Execute down projection using grouped GEMM""" + def _prepare_down_projection_metadata( + self, hidden_states, down_weights, m_sizes, device + ): + """Prepare metadata specifically for down projection.""" problem_sizes = [] strides_abc = [] ptrs_abc = [] - down_outputs = [] + outputs = [] expert_idx = 0 for size in m_sizes: if size > 0 and expert_idx < len(hidden_states): hidden = hidden_states[expert_idx] - down_weight = w_down[ - expert_idx - ].contiguous() # [hidden_size, intermediate_size] - - M, K = hidden.shape - N = down_weight.shape[0] # hidden_size - L = 1 - - # Create output tensor - down_output = torch.empty(M, N, dtype=self.dtype_torch, device=device) - - # Convert to MNKL format (following bench_group_gemm.py) - hidden_mnkl = hidden.unsqueeze(-1).contiguous() # (M, K, 1) - down_weight_mnkl = down_weight.unsqueeze(-1).contiguous() # (N, K, 1) - down_output_mnkl = down_output.unsqueeze(-1).contiguous() # (M, N, 1) - - # Extract 2D strides from MNKL tensors (following bench_group_gemm.py) - hidden_strides = hidden_mnkl.stride()[:2] - down_weight_strides = down_weight_mnkl.stride()[:2] - down_output_strides = down_output_mnkl.stride()[:2] - - # Down projection metadata - problem_sizes.append([M, N, K, L]) - strides_abc.append( - [ - list(hidden_strides), # A strides - list(down_weight_strides), # B strides - list(down_output_strides), # C strides - ] + down_weight = down_weights[expert_idx].contiguous() + + output = self._create_output_tensor( + hidden.shape[0], down_weight.shape[0], device ) - ptrs_abc.append( - [hidden.data_ptr(), down_weight.data_ptr(), down_output.data_ptr()] + outputs.append(output) + + self._add_projection_to_metadata( + hidden, down_weight, output, problem_sizes, strides_abc, ptrs_abc ) - down_outputs.append(down_output) expert_idx += 1 - if not problem_sizes: - return [] + return problem_sizes, strides_abc, ptrs_abc, outputs - # Execute grouped GEMM for down projection - self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + def _create_output_tensor(self, m_size, n_size, device): + """Create an output tensor with the specified dimensions.""" + return torch.empty(m_size, n_size, dtype=self.DTYPE_TORCH, device=device) - return down_outputs + def _add_projection_to_metadata( + self, + input_tensor, + weight_tensor, + output_tensor, + problem_sizes, + strides_abc, + ptrs_abc, + ): + """Add a single projection to the metadata lists.""" + M, K = input_tensor.shape + N = 
weight_tensor.shape[0] + L = 1 + + # Convert to MNKL format + input_mnkl = input_tensor.unsqueeze(-1).contiguous() + weight_mnkl = weight_tensor.unsqueeze(-1).contiguous() + output_mnkl = output_tensor.unsqueeze(-1).contiguous() + + # Extract strides + input_strides = list(input_mnkl.stride()[:2]) + weight_strides = list(weight_mnkl.stride()[:2]) + output_strides = list(output_mnkl.stride()[:2]) + + # Add to metadata + problem_sizes.append([M, N, K, L]) + strides_abc.append([input_strides, weight_strides, output_strides]) + ptrs_abc.append( + [ + input_tensor.data_ptr(), + weight_tensor.data_ptr(), + output_tensor.data_ptr(), + ] + ) + + def _split_gate_up_outputs(self, outputs): + """Split combined gate/up outputs into separate lists.""" + if not outputs: + return [], [] + + # Outputs are interleaved: [gate0, up0, gate1, up1, ...] + gate_outputs = outputs[::2] # Even indices + up_outputs = outputs[1::2] # Odd indices + return gate_outputs, up_outputs def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): - """Execute the grouped GEMM kernel""" + """Execute the grouped GEMM kernel.""" num_groups = len(problem_sizes) - # Convert to tensors + # Convert to CUTE tensors + problem_sizes_cute, strides_cute, ptrs_cute = self._convert_to_cute_tensors( + problem_sizes, strides_abc, ptrs_abc, device + ) + + # Get tensormap and compute clusters + tensormap_cute = self._get_tensormap_buffer(device) + total_clusters = self._compute_total_clusters(problem_sizes) + + # Get initial tensors for compilation + initial_tensors = self._create_initial_tensors(problem_sizes[0], device) + + # Compile or retrieve kernel + compiled_kernel = self._get_compiled_kernel( + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ) + + # Execute kernel + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + torch.cuda.synchronize() + + def _convert_to_cute_tensors(self, problem_sizes, strides_abc, ptrs_abc, device): + """Convert metadata to CUTE tensors.""" problem_sizes_tensor = torch.tensor( problem_sizes, dtype=torch.int32, device=device ) strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) - # Convert to CUTE tensors - problem_sizes_cute = from_dlpack( - problem_sizes_tensor, assumed_align=self.alignment + return ( + from_dlpack(problem_sizes_tensor, assumed_align=self.ALIGNMENT), + from_dlpack(strides_tensor, assumed_align=self.ALIGNMENT), + from_dlpack(ptrs_tensor, assumed_align=self.ALIGNMENT), ) - strides_cute = from_dlpack(strides_tensor, assumed_align=self.alignment) - ptrs_cute = from_dlpack(ptrs_tensor, assumed_align=self.alignment) - - # Setup tensormap buffer - tensormap_cute = self._get_tensormap_buffer(device) - - # Compute total clusters - total_clusters = self._compute_total_clusters(problem_sizes) - # Create initial tensors for kernel compilation (use first problem for shapes) - initial_A, initial_B, initial_C = self._create_initial_tensors( - problem_sizes[0], device + def _get_compiled_kernel( + self, + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ): + """Get or compile the grouped GEMM kernel.""" + cache_key = ( + num_groups, + total_clusters, + self.use_2cta_instrs, + self.mma_tiler_mn, + self.cluster_shape_mn, ) - # Get or compile kernel - cache_key = (num_groups, 
total_clusters) if cache_key not in self._compiled_kernels: - print(f"Compiling grouped GEMM kernel for {num_groups} groups") + print( + f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}" + ) + self._compiled_kernels[cache_key] = cute.compile( self.grouped_gemm, - initial_A, - initial_B, - initial_C, + *initial_tensors, num_groups, problem_sizes_cute, strides_cute, @@ -1020,77 +670,56 @@ def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): ) print("Kernel compilation successful") - # Execute kernel - compiled_kernel = self._compiled_kernels[cache_key] - compiled_kernel( - initial_A, - initial_B, - initial_C, - problem_sizes_cute, - strides_cute, - ptrs_cute, - tensormap_cute, - self.stream, - ) - - # Synchronize to ensure completion - torch.cuda.synchronize() + return self._compiled_kernels[cache_key] def _create_initial_tensors(self, problem_shape, device): - """Create initial CUTE tensors for kernel compilation""" + """Create initial CUTE tensors for kernel compilation.""" M, N, K, L = problem_shape - # Create tensors with the right shapes - # A: tokens [M, K], B: weights [N, K], C: output [M, N] - A_init = torch.randn(M, K, dtype=self.dtype_torch, device=device) - B_init = torch.randn( - N, K, dtype=self.dtype_torch, device=device - ) # Already (N, K) format - C_init = torch.zeros(M, N, dtype=self.dtype_torch, device=device) - - # Convert to MNKL format - A_mnkl = A_init.unsqueeze(-1).contiguous() # (M, K) -> (M, K, 1) - B_mnkl = B_init.unsqueeze( - -1 - ).contiguous() # (N, K) -> (N, K, 1) - no transpose needed - C_mnkl = C_init.unsqueeze(-1).contiguous() # (M, N) -> (M, N, 1) - - # Create CUTE tensors - A_cute = from_dlpack(A_mnkl, assumed_align=self.alignment) - B_cute = from_dlpack(B_mnkl, assumed_align=self.alignment) - C_cute = from_dlpack(C_mnkl, assumed_align=self.alignment) - - # Set CUTLASS data types - A_cute.element_type = self.dtype_cutlass - B_cute.element_type = self.dtype_cutlass - C_cute.element_type = self.dtype_cutlass + # Create tensors + tensors = [ + torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A + torch.randn(N, K, dtype=self.DTYPE_TORCH, device=device), # B + torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C + ] - # Mark layouts as dynamic - A_cute = A_cute.mark_layout_dynamic(leading_dim=1) - B_cute = B_cute.mark_layout_dynamic(leading_dim=1) - C_cute = C_cute.mark_layout_dynamic(leading_dim=1) + # Convert to MNKL format and create CUTE tensors + cute_tensors = [] + for tensor in tensors: + mnkl_tensor = tensor.unsqueeze(-1).contiguous() + cute_tensor = from_dlpack(mnkl_tensor, assumed_align=self.ALIGNMENT) + cute_tensor.element_type = self.DTYPE_CUTLASS + cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) + cute_tensors.append(cute_tensor) - return A_cute, B_cute, C_cute + return cute_tensors def _get_tensormap_buffer(self, device): - """Get or create tensormap buffer""" + """Get or create tensormap buffer.""" if device not in self._tensormap_buffers: sm_count = self.hardware_info.get_max_active_clusters(1) tensormap_tensor = torch.zeros( - (sm_count, 3, 128 // 8), # 3 tensormaps (A, B, C), 128 bytes each + (sm_count, self.TENSORMAP_COUNT, self.TENSORMAP_BYTES // 8), dtype=torch.int64, device=device, ) self._tensormap_buffers[device] = from_dlpack( - tensormap_tensor, assumed_align=self.alignment + tensormap_tensor, assumed_align=self.ALIGNMENT ) return self._tensormap_buffers[device] def _compute_total_clusters(self, 
problem_sizes): - """Compute total number of clusters needed""" - cluster_tile_m = 128 # From mma_tiler_mn[0] - cluster_tile_n = 64 # From mma_tiler_mn[1] + """Compute total number of clusters needed.""" + cluster_tile_m = self.mma_tiler_mn[0] + cluster_tile_n = self.mma_tiler_mn[1] + + # Adjust for 2 CTA mode and cluster shape + if self.use_2cta_instrs: + cluster_tile_m //= 2 + + cluster_tile_m *= self.cluster_shape_mn[0] + cluster_tile_n *= self.cluster_shape_mn[1] total = 0 for M, N, K, L in problem_sizes: @@ -1100,20 +729,15 @@ def _compute_total_clusters(self, problem_sizes): return total - def _apply_activation_and_combine(self, gate_outputs, up_outputs, m_sizes): - """Apply activation and combine gate/up outputs""" - hidden_states = [] - - for gate_out, up_out in zip(gate_outputs, up_outputs): - # Apply activation to gate output and multiply with up output - activated_gate = self.activation_function(gate_out) - combined = activated_gate * up_out - hidden_states.append(combined) - - return hidden_states + def _apply_activation_and_combine(self, gate_outputs, up_outputs): + """Apply activation and combine gate/up outputs.""" + return [ + self.activation_function(gate_out) * up_out + for gate_out, up_out in zip(gate_outputs, up_outputs) + ] def _reconstruct_output(self, final_outputs, m_sizes, output): - """Reconstruct the full output tensor from expert results""" + """Reconstruct the full output tensor from expert results.""" offset = 0 expert_idx = 0 @@ -1127,7 +751,10 @@ def _reconstruct_output(self, final_outputs, m_sizes, output): @staticmethod def is_available() -> bool: - return CUTLASS_AVAILABLE + return HAS_CUTLASS + + +# ========================= end of CUTLASSGroupedGemmStrategy ========================= class TritonCGBF16GroupGEMM(GroupGEMMStrategy): diff --git a/torchtitan/models/llama3/train_configs/debug_model.toml b/torchtitan/models/llama3/train_configs/debug_model.toml index 4d4426d0d..c063d09f2 100644 --- a/torchtitan/models/llama3/train_configs/debug_model.toml +++ b/torchtitan/models/llama3/train_configs/debug_model.toml @@ -43,8 +43,10 @@ local_batch_size = 8 seq_len = 2048 max_norm = 1.0 # grad norm clipping steps = 10 -compile = false +compile = true dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) +seed = 2020 +deterministic = true [parallelism] data_parallel_replicate_degree = 1 From 2da79a86e89feae547a7e94bd193e17ec913fc0c Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Wed, 11 Jun 2025 15:50:13 -0700 Subject: [PATCH 09/34] all cluster sizes running nicely --- .../experiments/deepseek_v3/group_gemms.py | 26 ++++++++++--------- .../experiments/deepseek_v3/test_moe.py | 4 +-- .../llama3/train_configs/debug_model.toml | 4 +-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/group_gemms.py b/torchtitan/experiments/deepseek_v3/group_gemms.py index 3226964a7..536557d10 100644 --- a/torchtitan/experiments/deepseek_v3/group_gemms.py +++ b/torchtitan/experiments/deepseek_v3/group_gemms.py @@ -225,9 +225,9 @@ class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): (4, 4), ] - SINGLE_CTA_M_SIZES = [64, 128] - DUAL_CTA_M_SIZES = [128, 256] - N_SIZE_RANGE = range(32, 257, 32) + SINGLE_CTA_M_SIZES = [128, 64] + DUAL_CTA_M_SIZES = [256, 128] + N_SIZE_RANGE = range(32, 257, 32) # 32 - 256, step 32 DTYPE_TORCH = torch.bfloat16 DTYPE_CUTLASS = cutlass.BFloat16 @@ -239,16 +239,16 @@ class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): def __init__( self, custom_activation, - use_2cta_instrs=False, - 
mma_tiler_mn=None, - cluster_shape_mn=None, + use_2cta_instrs=True, + mma_tiler_mn=(256, 128), + cluster_shape_mn=(2, 2), ): """ Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture. Args: custom_activation: The activation function to use - use_2cta_instrs (bool): Whether to use 2 CTA instructions for enhanced performance + use_2cta_instrs (bool): Whether to use 2 CTA instructions mma_tiler_mn (tuple, optional): MMA tile shape (M, N). If None, uses Blackwell-optimized defaults cluster_shape_mn (tuple, optional): Cluster shape (M, N). If None, uses optimized defaults """ @@ -260,7 +260,7 @@ def __init__( self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() # Validate configurations - self._validate_configurations() + # self._validate_configurations() # Initialize kernel and hardware info self._initialize_kernel() @@ -270,11 +270,11 @@ def __init__( self._compiled_kernels = {} self._tensormap_buffers = {} - self._log_initialization() + # self._log_initialization() def _get_default_mma_tiler(self): """Get default MMA tiler configuration based on CTA mode.""" - return (256, 128) if self.use_2cta_instrs else (128, 128) + return (256, 128) if self.use_2cta_instrs else (128, 64) def _get_default_cluster_shape(self): """Get default cluster shape based on CTA mode.""" @@ -345,7 +345,9 @@ def _validate_2cta_constraints(self): ) def _log_initialization(self): - """Log initialization information.""" + """Log initialization information. + I'm using print instead of logger b/c of cross talk from the cute dsl compiler logging + """ cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] print(f"Initialized CUTLASSGroupedGemmStrategy for Blackwell with:") print(f" - 2 CTA instructions: {self.use_2cta_instrs}") @@ -353,7 +355,7 @@ def _log_initialization(self): print(f" - Cluster shape (M, N): {self.cluster_shape_mn}") print(f" - Cluster size: {cluster_size}") if cluster_size > 1: - print(f" - Using multi-cluster parallelism for enhanced performance") + print(f" - Using multi-CTA cluster parallelism ") def arrange_expert_weights(self, all_weights, submod_name, module): """Store weights in stacked format.""" diff --git a/torchtitan/experiments/deepseek_v3/test_moe.py b/torchtitan/experiments/deepseek_v3/test_moe.py index 5a3ed3dcd..628eff706 100644 --- a/torchtitan/experiments/deepseek_v3/test_moe.py +++ b/torchtitan/experiments/deepseek_v3/test_moe.py @@ -805,11 +805,11 @@ def main(): ) elif arch_info["is_hopper"]: print( - f"✅ PyTorch grouped_mm available on Hopper (compute capability {arch_info['compute_capability']})" + f" PyTorch grouped_mm available on Hopper (compute capability {arch_info['compute_capability']})" ) if benchmark.timing_method == "triton": - print("🎯 Using Triton do_bench for most accurate GPU timing") + print("Using Triton do_bench for GPU timing") elif benchmark.timing_method == "cuda_events": print("⏱️ Using CUDA events for accurate GPU timing") else: diff --git a/torchtitan/models/llama3/train_configs/debug_model.toml b/torchtitan/models/llama3/train_configs/debug_model.toml index c063d09f2..4d4426d0d 100644 --- a/torchtitan/models/llama3/train_configs/debug_model.toml +++ b/torchtitan/models/llama3/train_configs/debug_model.toml @@ -43,10 +43,8 @@ local_batch_size = 8 seq_len = 2048 max_norm = 1.0 # grad norm clipping steps = 10 -compile = true +compile = false dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) -seed = 2020 -deterministic = true [parallelism] data_parallel_replicate_degree = 1 From 
565d01467d7a712ec843842dab3a920986614c18 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Thu, 12 Jun 2025 12:52:46 -0700 Subject: [PATCH 10/34] minimize cpu-gpu synchs --- .../experiments/deepseek_v3/group_gemms.py | 363 +++++++++++------- 1 file changed, 214 insertions(+), 149 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/group_gemms.py b/torchtitan/experiments/deepseek_v3/group_gemms.py index 536557d10..6c76317b6 100644 --- a/torchtitan/experiments/deepseek_v3/group_gemms.py +++ b/torchtitan/experiments/deepseek_v3/group_gemms.py @@ -191,28 +191,10 @@ class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): """ Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. - Supports both single and dual CTA instruction modes with flexible cluster configurations. - - Supported cluster shapes: (1,1), (1,2), (1,4), (2,1), (2,2), (2,4), (4,1), (4,2), (4,4) - - Usage Examples: - - # Basic single CTA mode - strategy = CUTLASSGroupedGemmStrategy(custom_activation) - - # 2 CTA mode with default settings - strategy = CUTLASSGroupedGemmStrategy(custom_activation, use_2cta_instrs=True) - - # High-performance configuration with large cluster - strategy = CUTLASSGroupedGemmStrategy( - custom_activation, - use_2cta_instrs=True, - mma_tiler_mn=(256, 128), - cluster_shape_mn=(4, 4) - ) + This version eliminates CPU-GPU synchronization by keeping all size/offset computations on GPU. """ - # Constants + # Constants (same as before) SUPPORTED_CLUSTER_SHAPES = [ (1, 1), (1, 2), @@ -241,17 +223,9 @@ def __init__( custom_activation, use_2cta_instrs=True, mma_tiler_mn=(256, 128), - cluster_shape_mn=(2, 2), + cluster_shape_mn=(4, 4), ): - """ - Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture. - - Args: - custom_activation: The activation function to use - use_2cta_instrs (bool): Whether to use 2 CTA instructions - mma_tiler_mn (tuple, optional): MMA tile shape (M, N). If None, uses Blackwell-optimized defaults - cluster_shape_mn (tuple, optional): Cluster shape (M, N). If None, uses optimized defaults - """ + """Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture.""" super().__init__(custom_activation) self.use_2cta_instrs = use_2cta_instrs @@ -270,11 +244,11 @@ def __init__( self._compiled_kernels = {} self._tensormap_buffers = {} - # self._log_initialization() + self._log_initialization() def _get_default_mma_tiler(self): """Get default MMA tiler configuration based on CTA mode.""" - return (256, 128) if self.use_2cta_instrs else (128, 64) + return (256, 128) if self.use_2cta_instrs else (128, 128) def _get_default_cluster_shape(self): """Get default cluster shape based on CTA mode.""" @@ -301,7 +275,7 @@ def _initialize_hardware(self): self.stream = cuda.CUstream(torch_stream.cuda_stream) def _validate_configurations(self): - """Validate that the configurations are compatible with Blackwell and the selected mode.""" + """Validate configurations for Blackwell.""" self._validate_mma_tiler() self._validate_cluster_shape() self._validate_2cta_constraints() @@ -345,9 +319,7 @@ def _validate_2cta_constraints(self): ) def _log_initialization(self): - """Log initialization information. 
- I'm using print instead of logger b/c of cross talk from the cute dsl compiler logging - """ + """Log initialization information.""" cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] print(f"Initialized CUTLASSGroupedGemmStrategy for Blackwell with:") print(f" - 2 CTA instructions: {self.use_2cta_instrs}") @@ -355,18 +327,31 @@ def _log_initialization(self): print(f" - Cluster shape (M, N): {self.cluster_shape_mn}") print(f" - Cluster size: {cluster_size}") if cluster_size > 1: - print(f" - Using multi-CTA cluster parallelism ") + print(f" - Using multi-CTA parallelism") def arrange_expert_weights(self, all_weights, submod_name, module): """Store weights in stacked format.""" return torch.stack(all_weights) def execute(self, contig_tokens, m_sizes, m_offsets, module): - """Execute using CUTLASS grouped GEMM kernel.""" + """ + Execute using CUTLASS grouped GEMM kernel - GPU-only version. + + Args: + contig_tokens: Input tokens arranged contiguously by expert + m_sizes: Tensor of expert sizes (GPU tensor to avoid sync) + m_offsets: Tensor of expert offsets (GPU tensor to avoid sync) + module: MoE module containing weights + """ + # Convert to GPU tensors if needed (avoid CPU-GPU sync) + m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( + m_sizes, m_offsets, contig_tokens.device + ) + # Validate inputs - self._validate_inputs(contig_tokens, m_sizes, module) + # self._validate_inputs(contig_tokens, m_sizes_gpu, module) - # Get weights + # Get weights and device weights = self._get_weights(module) device = contig_tokens.device @@ -378,26 +363,51 @@ def execute(self, contig_tokens, m_sizes, m_offsets, module): device=device, ) - # Check for valid experts - if not any(size > 0 for size in m_sizes): + # Check for valid experts using GPU operations (no sync) + if not self._has_valid_experts_gpu(m_sizes_gpu): return output - # Execute the three-stage computation - gate_outputs, up_outputs = self._execute_projections( - contig_tokens, weights["gate"], weights["up"], m_sizes, device + # Execute the three-stage computation using GPU-only operations + gate_outputs, up_outputs = self._execute_projections_gpu( + contig_tokens, + weights["gate"], + weights["up"], + m_sizes_gpu, + m_offsets_gpu, + device, ) hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) - final_outputs = self._execute_projections( - hidden_states, weights["down"], None, m_sizes, device, is_down_proj=True - )[ - 0 - ] # Only return first element for down projection + final_outputs = self._execute_down_projection_gpu( + hidden_states, weights["down"], m_sizes_gpu, device + ) + + return self._reconstruct_output_gpu( + final_outputs, m_sizes_gpu, m_offsets_gpu, output + ) + + def _ensure_gpu_tensors(self, m_sizes, m_offsets, device): + """Ensure sizes and offsets are GPU tensors to avoid CPU-GPU sync.""" + if not isinstance(m_sizes, torch.Tensor): + m_sizes_gpu = torch.tensor(m_sizes, dtype=torch.int32, device=device) + else: + m_sizes_gpu = m_sizes.to(device=device, dtype=torch.int32) + + if not isinstance(m_offsets, torch.Tensor): + m_offsets_gpu = torch.tensor(m_offsets, dtype=torch.int32, device=device) + else: + m_offsets_gpu = m_offsets.to(device=device, dtype=torch.int32) - return self._reconstruct_output(final_outputs, m_sizes, output) + return m_sizes_gpu, m_offsets_gpu - def _validate_inputs(self, contig_tokens, m_sizes, module): + def _has_valid_experts_gpu(self, m_sizes_gpu): + """Check if any experts have tokens using GPU operations (no sync).""" + return torch.any( + 
m_sizes_gpu > 0 + ).item() # Single sync here is unavoidable for control flow + + def _validate_inputs(self, contig_tokens, m_sizes_gpu, module): """Validate input parameters.""" if contig_tokens.dtype != self.DTYPE_TORCH: raise ValueError( @@ -409,10 +419,10 @@ def _validate_inputs(self, contig_tokens, m_sizes, module): f"Expected 2D input tensor, got shape {contig_tokens.shape}" ) - # required_params = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] - # for param in required_params: - # if not hasattr(module, param) or module.get_parameter(param) is None: - # raise ValueError(f"Module missing required parameter: {param}") + required_params = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] + for param in required_params: + if not hasattr(module, param) or module.get_parameter(param) is None: + raise ValueError(f"Module missing required parameter: {param}") def _get_weights(self, module): """Extract and return weight tensors from module.""" @@ -422,120 +432,165 @@ def _get_weights(self, module): "down": module.get_parameter("down_proj_weight"), } - def _execute_projections( - self, input_tokens, weight1, weight2, m_sizes, device, is_down_proj=False + def _execute_projections_gpu( + self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device ): - """Execute one or two projections using grouped GEMM.""" - # Prepare metadata for the projection(s) - problem_sizes, strides_abc, ptrs_abc, outputs = ( - self._prepare_projection_metadata( - input_tokens, weight1, weight2, m_sizes, device, is_down_proj + """Execute gate and up projections using GPU-only operations.""" + # Find valid experts using GPU operations + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return [], [] + + # Prepare metadata in batch using GPU operations + problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs = ( + self._prepare_gate_up_metadata_gpu( + input_tokens, + weight1, + weight2, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, ) ) - if not problem_sizes: + if len(problem_sizes) == 0: return [], [] # Execute grouped GEMM self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) - return outputs if not is_down_proj else (outputs, []) + return gate_outputs, up_outputs - def _prepare_projection_metadata( - self, input_tokens, weight1, weight2, m_sizes, device, is_down_proj + def _prepare_gate_up_metadata_gpu( + self, + input_tokens, + gate_weights, + up_weights, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, ): - """Prepare metadata for projection operations.""" + """Prepare metadata for gate and up projections""" problem_sizes = [] strides_abc = [] ptrs_abc = [] - outputs = [] - - if is_down_proj: - # For down projection, input_tokens is actually hidden_states list - return self._prepare_down_projection_metadata( - input_tokens, weight1, m_sizes, device + gate_outputs = [] + up_outputs = [] + + # Extract valid sizes and offsets (minimal sync - only for valid experts) + valid_sizes = m_sizes_gpu[valid_indices] + valid_offsets = ( + m_offsets_gpu[valid_indices] + if len(m_offsets_gpu) > len(valid_indices) + else torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 ) + ) - offset = 0 - for expert_idx, size in enumerate(m_sizes): + # Convert to Python for iteration (unavoidable in this test for metadata preparation) + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + valid_indices_cpu = 
valid_indices.cpu().tolist() + + for i, (expert_idx, size, offset) in enumerate( + zip(valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu) + ): if size > 0: # Get expert data expert_tokens = input_tokens[offset : offset + size].contiguous() - weight1_expert = weight1[expert_idx].contiguous() - weight2_expert = ( - weight2[expert_idx].contiguous() if weight2 is not None else None - ) - - # Create outputs and add to metadata - output1 = self._create_output_tensor( - expert_tokens.shape[0], weight1_expert.shape[0], device - ) - outputs.append(output1) - - self._add_projection_to_metadata( - expert_tokens, - weight1_expert, - output1, - problem_sizes, - strides_abc, - ptrs_abc, - ) - - if weight2_expert is not None: - output2 = self._create_output_tensor( - expert_tokens.shape[0], weight2_expert.shape[0], device - ) - outputs.append(output2) - + gate_weight = gate_weights[expert_idx].contiguous() + up_weight = up_weights[expert_idx].contiguous() + + M, K = expert_tokens.shape + N = gate_weight.shape[0] + L = 1 + + # Create output tensors + gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + up_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + + # Add both projections to metadata + for weight, output, output_list in [ + (gate_weight, gate_output, gate_outputs), + (up_weight, up_output, up_outputs), + ]: self._add_projection_to_metadata( expert_tokens, - weight2_expert, - output2, + weight, + output, problem_sizes, strides_abc, ptrs_abc, ) + output_list.append(output) - offset += size + return problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs - return ( - problem_sizes, - strides_abc, - ptrs_abc, - self._split_gate_up_outputs(outputs), + def _execute_down_projection_gpu( + self, hidden_states, down_weights, m_sizes_gpu, device + ): + """Execute down projection using GPU operations.""" + if not hidden_states: + return [] + + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + # Prepare metadata + problem_sizes, strides_abc, ptrs_abc, down_outputs = ( + self._prepare_down_metadata_gpu( + hidden_states, down_weights, valid_indices, device + ) ) - def _prepare_down_projection_metadata( - self, hidden_states, down_weights, m_sizes, device + if len(problem_sizes) == 0: + return [] + + # Execute grouped GEMM + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return down_outputs + + def _prepare_down_metadata_gpu( + self, hidden_states, down_weights, valid_indices, device ): - """Prepare metadata specifically for down projection.""" + """Prepare metadata for down projection using GPU operations.""" problem_sizes = [] strides_abc = [] ptrs_abc = [] - outputs = [] + down_outputs = [] - expert_idx = 0 - for size in m_sizes: - if size > 0 and expert_idx < len(hidden_states): - hidden = hidden_states[expert_idx] + # Convert indices to CPU for iteration (minimal sync) + valid_indices_cpu = valid_indices.cpu().tolist() + + for i, expert_idx in enumerate(valid_indices_cpu): + if i < len(hidden_states): + hidden = hidden_states[i] down_weight = down_weights[expert_idx].contiguous() - output = self._create_output_tensor( - hidden.shape[0], down_weight.shape[0], device - ) - outputs.append(output) + M, K = hidden.shape + N = down_weight.shape[0] + + # Create output tensor + down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + down_outputs.append(down_output) + # Add to metadata self._add_projection_to_metadata( - hidden, down_weight, output, 
problem_sizes, strides_abc, ptrs_abc + hidden, + down_weight, + down_output, + problem_sizes, + strides_abc, + ptrs_abc, ) - expert_idx += 1 - - return problem_sizes, strides_abc, ptrs_abc, outputs - - def _create_output_tensor(self, m_size, n_size, device): - """Create an output tensor with the specified dimensions.""" - return torch.empty(m_size, n_size, dtype=self.DTYPE_TORCH, device=device) + return problem_sizes, strides_abc, ptrs_abc, down_outputs def _add_projection_to_metadata( self, @@ -572,16 +627,6 @@ def _add_projection_to_metadata( ] ) - def _split_gate_up_outputs(self, outputs): - """Split combined gate/up outputs into separate lists.""" - if not outputs: - return [], [] - - # Outputs are interleaved: [gate0, up0, gate1, up1, ...] - gate_outputs = outputs[::2] # Even indices - up_outputs = outputs[1::2] # Odd indices - return gate_outputs, up_outputs - def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): """Execute the grouped GEMM kernel.""" num_groups = len(problem_sizes) @@ -738,16 +783,36 @@ def _apply_activation_and_combine(self, gate_outputs, up_outputs): for gate_out, up_out in zip(gate_outputs, up_outputs) ] - def _reconstruct_output(self, final_outputs, m_sizes, output): - """Reconstruct the full output tensor from expert results.""" - offset = 0 - expert_idx = 0 + def _reconstruct_output_gpu( + self, final_outputs, m_sizes_gpu, m_offsets_gpu, output + ): + """Reconstruct the full output tensor using GPU operations (minimal sync).""" + if not final_outputs: + return output - for size in m_sizes: - if size > 0 and expert_idx < len(final_outputs): - output[offset : offset + size] = final_outputs[expert_idx] - expert_idx += 1 - offset += size + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + valid_sizes = m_sizes_gpu[valid_indices] + + # Compute offsets if not provided properly + if len(m_offsets_gpu) <= len(valid_indices): + valid_offsets = torch.cumsum( + torch.cat( + [torch.tensor([0], device=m_sizes_gpu.device), valid_sizes[:-1]] + ), + dim=0, + ) + else: + valid_offsets = m_offsets_gpu[valid_indices] + + # Convert to CPU for final reconstruction (minimal sync) + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + + for i, (size, offset) in enumerate(zip(valid_sizes_cpu, valid_offsets_cpu)): + if i < len(final_outputs): + output[offset : offset + size] = final_outputs[i] return output From aff4a06d3e54ee40903772c82ae0f963341e511e Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Thu, 12 Jun 2025 16:05:04 -0700 Subject: [PATCH 11/34] initial backwards for cutlass (not working) --- .../deepseek_v3/cutlass_backwards.py | 1477 +++++++++++++++++ .../experiments/deepseek_v3/group_gemms.py | 6 +- torchtitan/experiments/deepseek_v3/hw_info.py | 17 + 3 files changed, 1497 insertions(+), 3 deletions(-) create mode 100644 torchtitan/experiments/deepseek_v3/cutlass_backwards.py create mode 100644 torchtitan/experiments/deepseek_v3/hw_info.py diff --git a/torchtitan/experiments/deepseek_v3/cutlass_backwards.py b/torchtitan/experiments/deepseek_v3/cutlass_backwards.py new file mode 100644 index 000000000..624a47d97 --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/cutlass_backwards.py @@ -0,0 +1,1477 @@ +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn + + +try: + import cuda.bindings.driver as cuda + import cutlass + import cutlass.cute as cute + import cutlass.torch as cutlass_torch + import cutlass.utils as 
utils + + from cutlass.cute.runtime import from_dlpack + from torchtitan.experiments.kernels.blackwell.cute_grouped_gemm import ( + GroupedGemmKernel, + ) + + HAS_CUTLASS = True + print("✓ CUTLASS and strategies imported successfully") +except ImportError as e: + HAS_CUTLASS = False + print(f"✗ Import failed: {e}") + print("Using PyTorch fallback implementations only") + +""" +Notes! + +current error - requires a kernel context before getting hardware info. + +This class is used to get the hardware info of given GPU device. +It provides methods to get the max active clusters for given cluster size. + +Prerequisite: +- CUDA driver is initialized via `driver.cuInit` or other CUDA APIs. +- CUDA context is created via `driver.cuCtxCreate` or other CUDA APIs. + + +this works: +cute hardware - device_id 0 +cute hardware - driver_version 12080 +2025-06-12 15:59:27,116 - INFO - Started preprocessing [_host_function] +2025-06-12 15:59:27,117 - INFO - ASTPreprocessor Transforming function [_host_function] +2025-06-12 15:59:27,118 - INFO - ASTPreprocessor Executing transformed code for function [_host_function] +2025-06-12 15:59:27,118 - INFO - Final mangled function name: cutlass__host_function_cutlassutilshardware_infoHardwareInfo_object_at_ +2025-06-12 15:59:27,120 - INFO - Started preprocessing [_empty_kernel] +2025-06-12 15:59:27,120 - INFO - ASTPreprocessor Transforming function [_empty_kernel] +2025-06-12 15:59:27,121 - INFO - ASTPreprocessor Executing transformed code for function [_empty_kernel] +2025-06-12 15:59:27,122 - INFO - Final mangled function name: cutlass__empty_kernel_cutlassutilshardware_infoHardwareInfo_object_at_ +2025-06-12 15:59:27,243 - INFO - Function=[cutlass__host_function_cutlassutilshardware_infoHardwareInfo_object_at_] Computed module_hash=[dcde861f00d587038a517012d015a7fe7d920bf8fd510b4d947c612997627913] +2025-06-12 15:59:27,243 - INFO - JIT cache miss function=[cutlass__host_function_cutlassutilshardware_infoHardwareInfo_object_at_] module_hash=[dcde861f00d587038a517012d015a7fe7d920bf8fd510b4d947c612997627913] +2025-06-12 15:59:27,275 - INFO - cuModuleLoadData 478334896 +2025-06-12 15:59:27,276 - INFO - cuModuleGetFunction kernel_cutlass__empty_kernel_cutlassutilshardware_infoHardwareInfo_object_at__0 +2025-06-12 15:59:27,276 - INFO - <-- cuModuleGetFunction +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 +Initialized CUTLASSGroupedGemmStrategy for Blackwell with: + +but basic strategy below fails: + +✓ CUTLASS and strategies imported successfully +Testing CUTLASS Backward Group GEMM... 
+Creating strategy for 4 experts, 1024 in_features, 2048 out_features, 512 total_tokens +Initializing CUTLASSGroupedGemmStrategy for Blackwell +cute hardware - device_id 0 +cute hardware - driver_version 12080 +Traceback (most recent call last): + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 1387, in + test_cutlass_backward_group_gemm() + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 1348, in test_cutlass_backward_group_gemm + strategy = CUTLASSGroupedGemmStrategy( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 120, in __init__ + self._initialize_hardware() + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 149, in _initialize_hardware + self.max_active_clusters = self.hardware_info.get_max_active_clusters( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/jit_executor.py", line 328, in walk_module_and_get_cubin_data + module.operation.walk(walk_gpu_binary_op) +RuntimeError: Exception raised in callback: Traceback (most recent call last): + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 1387, in + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 1348, in test_cutlass_backward_group_gemm + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 120, in __init__ + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 149, in _initialize_hardware + File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/utils/hardware_info.py", line 47, in get_max_active_clusters + File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/utils/hardware_info.py", line 176, in _get_device_function + File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/compiler.py", line 221, in compile + File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 1337, in _func + File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 1188, in generate_mlir + File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 1129, in compile_and_cache + File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/jit_executor.py", line 258, in update_jit_cuda_modules + File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/jit_executor.py", line 328, in walk_module_and_get_cubin_data + File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/jit_executor.py", line 325, in walk_gpu_binary_op + File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/jit_executor.py", line 243, in walk_callback + File 
"/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/runtime/cuda.py", line 294, in load_cubin_module_data + File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/runtime/cuda.py", line 229, in checkCudaErrors +DSLCudaRuntimeError: DSLCudaRuntimeError: Unknown CUDA error +Error Code: 201 + +🔍 Additional Context: +- Error name: CUDA_ERROR_INVALID_CONTEXT +- CUDA_TOOLKIT_PATH: not set +- Target SM ARCH: not set + +📊 GPU Information: +- CUDA devices available: 8 (current: 0) +- Architecture: Blackwell (sm_100a) +- Compatible SM archs: sm_100a + +Compatibility Check: +❌ Error: Target SM ARCH unknown is not compatible +💡 Please use one of SM ARCHs: sm_100a + +""" + + +# Strategy base class for GroupGEMM implementations +class GroupGEMMStrategy: + """Base class for group gemm strategies""" + + def __init__(self, custom_activation): + self.activation_function = custom_activation + + def arrange_expert_weights(self, all_weights, submod_name, module): + """prepare expert weights, including prescaling + + Args: + all_weights: List of weight tensors from each expert + submod_name: Name of the submodule (e.g., 'gate_proj', 'up_proj', 'down_proj') + module: The parent module that will store the arranged weights + + Returns: + Tensor: The arranged weights in the format required by the specific strategy + """ + + raise NotImplementedError("Requires arrange_expert_weights method") + + def execute(self, contig_tokens, m_sizes, m_offsets, module): + """Execute the group gemm operation + + Args: + contig_tokens: The input tokens, arranged contiguously by expert + m_sizes: Sizes of each group + m_offsets: Offsets of each group + module: The MoE module containing weights and parameters + + Returns: + The processed tokens + """ + raise NotImplementedError("GroupGEMM strategy must implement execute method") + + @staticmethod + def is_available() -> bool: + """Check if this strategy is available on the current system""" + return False + + +class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): + """ + Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. + + This version eliminates CPU-GPU synchronization by keeping all size/offset computations on GPU. 
+ """ + + # Constants + SUPPORTED_CLUSTER_SHAPES = [ + (1, 1), + (1, 2), + (1, 4), + (2, 1), + (2, 2), + (2, 4), + (4, 1), + (4, 2), + (4, 4), + ] + + SINGLE_CTA_M_SIZES = [128, 64] + DUAL_CTA_M_SIZES = [256, 128] + N_SIZE_RANGE = range(32, 257, 32) # 32 - 256, step 32 + + DTYPE_TORCH = torch.bfloat16 + DTYPE_CUTLASS = cutlass.BFloat16 + ACC_DTYPE = cutlass.Float32 + ALIGNMENT = 16 + TENSORMAP_COUNT = 3 + TENSORMAP_BYTES = 128 + + def __init__( + self, + custom_activation=nn.SiLU(), + use_2cta_instrs=True, + mma_tiler_mn=(256, 128), + cluster_shape_mn=(4, 4), + ): + """Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture.""" + print(f"Initializing CUTLASSGroupedGemmStrategy for Blackwell") + super().__init__(custom_activation) + self.use_2cta_instrs = use_2cta_instrs + + # Set configuration + self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() + self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() + + # Validate configurations + # self._validate_configurations() + + # Initialize kernel and hardware info + self._initialize_kernel() + self._initialize_hardware() + + # Initialize caches + self._compiled_kernels = {} + self._tensormap_buffers = {} + + self._log_initialization() + + def _get_default_mma_tiler(self): + """Get default MMA tiler configuration based on CTA mode.""" + return (256, 128) if self.use_2cta_instrs else (128, 128) + + def _get_default_cluster_shape(self): + """Get default cluster shape based on CTA mode.""" + return (2, 2) if self.use_2cta_instrs else (1, 1) + + def _initialize_kernel(self): + """Initialize the CUTLASS grouped GEMM kernel.""" + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.ACC_DTYPE, + use_2cta_instrs=self.use_2cta_instrs, + mma_tiler_mn=self.mma_tiler_mn, + cluster_shape_mn=self.cluster_shape_mn, + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + def _initialize_hardware(self): + """Initialize hardware information and stream.""" + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters( + self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + ) + + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) + + def _validate_configurations(self): + """Validate configurations for Blackwell.""" + self._validate_mma_tiler() + self._validate_cluster_shape() + self._validate_2cta_constraints() + + def _validate_mma_tiler(self): + """Validate MMA tiler configuration.""" + m_size, n_size = self.mma_tiler_mn + + valid_m_sizes = ( + self.DUAL_CTA_M_SIZES if self.use_2cta_instrs else self.SINGLE_CTA_M_SIZES + ) + mode_name = "2 CTA" if self.use_2cta_instrs else "single CTA" + + if m_size not in valid_m_sizes: + raise ValueError( + f"For {mode_name} mode on Blackwell, MMA tiler M must be in {valid_m_sizes}, got {m_size}" + ) + + if n_size not in self.N_SIZE_RANGE: + raise ValueError( + f"MMA tiler N must be in range [32, 256] with step 32, got {n_size}" + ) + + def _validate_cluster_shape(self): + """Validate cluster shape configuration.""" + if self.cluster_shape_mn not in self.SUPPORTED_CLUSTER_SHAPES: + raise ValueError( + f"Cluster shape {self.cluster_shape_mn} not supported on Blackwell. 
" + f"Valid cluster shapes are: {self.SUPPORTED_CLUSTER_SHAPES}" + ) + + def _validate_2cta_constraints(self): + """Validate 2 CTA specific constraints.""" + if self.use_2cta_instrs and self.cluster_shape_mn[0] % 2 != 0: + valid_2cta_shapes = [ + shape for shape in self.SUPPORTED_CLUSTER_SHAPES if shape[0] % 2 == 0 + ] + raise ValueError( + f"For 2 CTA mode, cluster shape M must be even, got {self.cluster_shape_mn[0]}. " + f"Valid 2 CTA cluster shapes: {valid_2cta_shapes}" + ) + + def _log_initialization(self): + """Log initialization information.""" + cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + print(f"Initialized CUTLASSGroupedGemmStrategy for Blackwell with:") + print(f" - 2 CTA instructions: {self.use_2cta_instrs}") + print(f" - MMA tiler (M, N): {self.mma_tiler_mn}") + print(f" - Cluster shape (M, N): {self.cluster_shape_mn}") + print(f" - Cluster size: {cluster_size}") + if cluster_size > 1: + print(f" - Using multi-CTA parallelism") + + def arrange_expert_weights(self, all_weights, submod_name, module): + """Store weights in stacked format.""" + return torch.stack(all_weights) + + def execute(self, contig_tokens, m_sizes, m_offsets, module): + """ + Execute using CUTLASS grouped GEMM kernel - GPU-only version. + + Args: + contig_tokens: Input tokens arranged contiguously by expert + m_sizes: Tensor of expert sizes (GPU tensor to avoid sync) + m_offsets: Tensor of expert offsets (GPU tensor to avoid sync) + module: MoE module containing weights + """ + # Convert to GPU tensors if needed (avoid CPU-GPU sync) + m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( + m_sizes, m_offsets, contig_tokens.device + ) + + # Validate inputs + # self._validate_inputs(contig_tokens, m_sizes_gpu, module) + + # Get weights and device + weights = self._get_weights(module) + device = contig_tokens.device + + # Prepare output tensor + output = torch.zeros( + contig_tokens.shape[0], + weights["gate"].shape[2], + dtype=self.DTYPE_TORCH, + device=device, + ) + + # Check for valid experts using GPU operations (no sync) + if not self._has_valid_experts_gpu(m_sizes_gpu): + return output + + # Execute the three-stage computation using GPU-only operations + gate_outputs, up_outputs = self._execute_projections_gpu( + contig_tokens, + weights["gate"], + weights["up"], + m_sizes_gpu, + m_offsets_gpu, + device, + ) + + hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) + + final_outputs = self._execute_down_projection_gpu( + hidden_states, weights["down"], m_sizes_gpu, device + ) + + return self._reconstruct_output_gpu( + final_outputs, m_sizes_gpu, m_offsets_gpu, output + ) + + def _ensure_gpu_tensors(self, m_sizes, m_offsets, device): + """Ensure sizes and offsets are GPU tensors to avoid CPU-GPU sync.""" + if not isinstance(m_sizes, torch.Tensor): + m_sizes_gpu = torch.tensor(m_sizes, dtype=torch.int32, device=device) + else: + m_sizes_gpu = m_sizes.to(device=device, dtype=torch.int32) + + if not isinstance(m_offsets, torch.Tensor): + m_offsets_gpu = torch.tensor(m_offsets, dtype=torch.int32, device=device) + else: + m_offsets_gpu = m_offsets.to(device=device, dtype=torch.int32) + + return m_sizes_gpu, m_offsets_gpu + + def _has_valid_experts_gpu(self, m_sizes_gpu): + """Check if any experts have tokens using GPU operations (no sync).""" + return torch.any( + m_sizes_gpu > 0 + ).item() # Single sync here is unavoidable for control flow + + def _validate_inputs(self, contig_tokens, m_sizes_gpu, module): + """Validate input parameters.""" + if 
contig_tokens.dtype != self.DTYPE_TORCH: + raise ValueError( + f"Expected input dtype {self.DTYPE_TORCH}, got {contig_tokens.dtype}" + ) + + if len(contig_tokens.shape) != 2: + raise ValueError( + f"Expected 2D input tensor, got shape {contig_tokens.shape}" + ) + + required_params = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] + for param in required_params: + if not hasattr(module, param) or module.get_parameter(param) is None: + raise ValueError(f"Module missing required parameter: {param}") + + def _get_weights(self, module): + """Extract and return weight tensors from module.""" + return { + "gate": module.get_parameter("gate_proj_weight"), + "up": module.get_parameter("up_proj_weight"), + "down": module.get_parameter("down_proj_weight"), + } + + def _execute_projections_gpu( + self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device + ): + """Execute gate and up projections using GPU-only operations.""" + # Find valid experts using GPU operations + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return [], [] + + # Prepare metadata in batch using GPU operations + problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs = ( + self._prepare_gate_up_metadata_gpu( + input_tokens, + weight1, + weight2, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ) + ) + + if len(problem_sizes) == 0: + return [], [] + + # Execute grouped GEMM + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return gate_outputs, up_outputs + + def _prepare_gate_up_metadata_gpu( + self, + input_tokens, + gate_weights, + up_weights, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ): + """Prepare metadata for gate and up projections""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + gate_outputs = [] + up_outputs = [] + + # Extract valid sizes and offsets (minimal sync - only for valid experts) + valid_sizes = m_sizes_gpu[valid_indices] + valid_offsets = ( + m_offsets_gpu[valid_indices] + if len(m_offsets_gpu) > len(valid_indices) + else torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 + ) + ) + + # Convert to Python for iteration (unavoidable in this test for metadata preparation) + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + valid_indices_cpu = valid_indices.cpu().tolist() + + for i, (expert_idx, size, offset) in enumerate( + zip(valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu) + ): + if size > 0: + # Get expert data + expert_tokens = input_tokens[offset : offset + size].contiguous() + gate_weight = gate_weights[expert_idx].contiguous() + up_weight = up_weights[expert_idx].contiguous() + + M, K = expert_tokens.shape + N = gate_weight.shape[0] + L = 1 + + # Create output tensors + gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + up_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + + # Add both projections to metadata + for weight, output, output_list in [ + (gate_weight, gate_output, gate_outputs), + (up_weight, up_output, up_outputs), + ]: + self._add_projection_to_metadata( + expert_tokens, + weight, + output, + problem_sizes, + strides_abc, + ptrs_abc, + ) + output_list.append(output) + + return problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs + + def _execute_down_projection_gpu( + self, hidden_states, down_weights, m_sizes_gpu, device + ): + """Execute down projection using GPU 
operations.""" + if not hidden_states: + return [] + + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + # Prepare metadata + problem_sizes, strides_abc, ptrs_abc, down_outputs = ( + self._prepare_down_metadata_gpu( + hidden_states, down_weights, valid_indices, device + ) + ) + + if len(problem_sizes) == 0: + return [] + + # Execute grouped GEMM + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return down_outputs + + def _prepare_down_metadata_gpu( + self, hidden_states, down_weights, valid_indices, device + ): + """Prepare metadata for down projection using GPU operations.""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + down_outputs = [] + + # Convert indices to CPU for iteration (minimal sync) + valid_indices_cpu = valid_indices.cpu().tolist() + + for i, expert_idx in enumerate(valid_indices_cpu): + if i < len(hidden_states): + hidden = hidden_states[i] + down_weight = down_weights[expert_idx].contiguous() + + M, K = hidden.shape + N = down_weight.shape[0] + + # Create output tensor + down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + down_outputs.append(down_output) + + # Add to metadata + self._add_projection_to_metadata( + hidden, + down_weight, + down_output, + problem_sizes, + strides_abc, + ptrs_abc, + ) + + return problem_sizes, strides_abc, ptrs_abc, down_outputs + + def _add_projection_to_metadata( + self, + input_tensor, + weight_tensor, + output_tensor, + problem_sizes, + strides_abc, + ptrs_abc, + ): + """Add a single projection to the metadata lists.""" + M, K = input_tensor.shape + N = weight_tensor.shape[0] + L = 1 + + # Convert to MNKL format + input_mnkl = input_tensor.unsqueeze(-1).contiguous() + weight_mnkl = weight_tensor.unsqueeze(-1).contiguous() + output_mnkl = output_tensor.unsqueeze(-1).contiguous() + + # Extract strides + input_strides = list(input_mnkl.stride()[:2]) + weight_strides = list(weight_mnkl.stride()[:2]) + output_strides = list(output_mnkl.stride()[:2]) + + # Add to metadata + problem_sizes.append([M, N, K, L]) + strides_abc.append([input_strides, weight_strides, output_strides]) + ptrs_abc.append( + [ + input_tensor.data_ptr(), + weight_tensor.data_ptr(), + output_tensor.data_ptr(), + ] + ) + + def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): + """Execute the grouped GEMM kernel.""" + num_groups = len(problem_sizes) + + # Convert to CUTE tensors + problem_sizes_cute, strides_cute, ptrs_cute = self._convert_to_cute_tensors( + problem_sizes, strides_abc, ptrs_abc, device + ) + + # Get tensormap and compute clusters + tensormap_cute = self._get_tensormap_buffer(device) + total_clusters = self._compute_total_clusters(problem_sizes) + + # Get initial tensors for compilation + initial_tensors = self._create_initial_tensors(problem_sizes[0], device) + + # Compile or retrieve kernel + compiled_kernel = self._get_compiled_kernel( + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ) + + # Execute kernel + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + torch.cuda.synchronize() + + def _convert_to_cute_tensors(self, problem_sizes, strides_abc, ptrs_abc, device): + """Convert metadata to CUTE tensors.""" + problem_sizes_tensor = torch.tensor( + problem_sizes, dtype=torch.int32, device=device + ) + strides_tensor = torch.tensor(strides_abc, 
dtype=torch.int32, device=device) + ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) + + return ( + from_dlpack(problem_sizes_tensor, assumed_align=self.ALIGNMENT), + from_dlpack(strides_tensor, assumed_align=self.ALIGNMENT), + from_dlpack(ptrs_tensor, assumed_align=self.ALIGNMENT), + ) + + def _get_compiled_kernel( + self, + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ): + """Get or compile the grouped GEMM kernel.""" + cache_key = ( + num_groups, + total_clusters, + self.use_2cta_instrs, + self.mma_tiler_mn, + self.cluster_shape_mn, + ) + + if cache_key not in self._compiled_kernels: + print( + f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}" + ) + + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + *initial_tensors, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + self.stream, + ) + print("Kernel compilation successful") + + return self._compiled_kernels[cache_key] + + def _create_initial_tensors(self, problem_shape, device): + """Create initial CUTE tensors for kernel compilation.""" + M, N, K, L = problem_shape + + # Create tensors + tensors = [ + torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A + torch.randn(N, K, dtype=self.DTYPE_TORCH, device=device), # B + torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C + ] + + # Convert to MNKL format and create CUTE tensors + cute_tensors = [] + for tensor in tensors: + mnkl_tensor = tensor.unsqueeze(-1).contiguous() + cute_tensor = from_dlpack(mnkl_tensor, assumed_align=self.ALIGNMENT) + cute_tensor.element_type = self.DTYPE_CUTLASS + cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) + cute_tensors.append(cute_tensor) + + return cute_tensors + + def _get_tensormap_buffer(self, device): + """Get or create tensormap buffer.""" + if device not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + tensormap_tensor = torch.zeros( + (sm_count, self.TENSORMAP_COUNT, self.TENSORMAP_BYTES // 8), + dtype=torch.int64, + device=device, + ) + self._tensormap_buffers[device] = from_dlpack( + tensormap_tensor, assumed_align=self.ALIGNMENT + ) + + return self._tensormap_buffers[device] + + def _compute_total_clusters(self, problem_sizes): + """Compute total number of clusters needed.""" + cluster_tile_m = self.mma_tiler_mn[0] + cluster_tile_n = self.mma_tiler_mn[1] + + # Adjust for 2 CTA mode and cluster shape + if self.use_2cta_instrs: + cluster_tile_m //= 2 + + cluster_tile_m *= self.cluster_shape_mn[0] + cluster_tile_n *= self.cluster_shape_mn[1] + + total = 0 + for M, N, K, L in problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + + return total + + def _apply_activation_and_combine(self, gate_outputs, up_outputs): + """Apply activation and combine gate/up outputs.""" + return [ + self.activation_function(gate_out) * up_out + for gate_out, up_out in zip(gate_outputs, up_outputs) + ] + + def _reconstruct_output_gpu( + self, final_outputs, m_sizes_gpu, m_offsets_gpu, output + ): + """Reconstruct the full output tensor using GPU operations (minimal sync).""" + if not final_outputs: + return output + + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, 
as_tuple=True)[0] + valid_sizes = m_sizes_gpu[valid_indices] + + # Compute offsets if not provided properly + if len(m_offsets_gpu) <= len(valid_indices): + valid_offsets = torch.cumsum( + torch.cat( + [torch.tensor([0], device=m_sizes_gpu.device), valid_sizes[:-1]] + ), + dim=0, + ) + else: + valid_offsets = m_offsets_gpu[valid_indices] + + # Convert to CPU for final reconstruction (minimal sync) + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + + for i, (size, offset) in enumerate(zip(valid_sizes_cpu, valid_offsets_cpu)): + if i < len(final_outputs): + output[offset : offset + size] = final_outputs[i] + + return output + + @staticmethod + def is_available() -> bool: + return HAS_CUTLASS + + +class CUTLASSBackwardGroupGemm(torch.autograd.Function): + """ + PyTorch autograd Function for CUTLASS grouped GEMM with backward pass support. + + This function computes grouped matrix multiplication and automatically handles + gradient computation for both inputs and weights using the same CUTLASS kernel. + + Forward: Y_i = X_i @ W_i^T for each expert i + Backward: + - dX_i = dY_i @ W_i for each expert i + - dW_i = dY_i^T @ X_i for each expert i + """ + + @staticmethod + def forward(ctx, input_tokens, weight_stack, m_sizes, m_offsets, strategy): + """ + Forward pass of grouped GEMM. + + Args: + ctx: PyTorch autograd context for saving tensors + input_tokens: Input tokens [total_tokens, hidden_size] + weight_stack: Stacked expert weights [num_experts, out_features, in_features] + m_sizes: Number of tokens per expert [num_experts] + m_offsets: Token offsets per expert [num_experts + 1] + strategy: CUTLASSGroupedGemmStrategy instance + + Returns: + output: Grouped GEMM result [total_tokens, out_features] + """ + # Save tensors and info for backward pass + ctx.save_for_backward(input_tokens, weight_stack, m_sizes, m_offsets) + ctx.strategy = strategy + + # Ensure tensors are on GPU and contiguous + input_tokens = input_tokens.contiguous() + weight_stack = weight_stack.contiguous() + m_sizes_gpu = ( + m_sizes.to(input_tokens.device) if not m_sizes.is_cuda else m_sizes + ) + m_offsets_gpu = ( + m_offsets.to(input_tokens.device) if not m_offsets.is_cuda else m_offsets + ) + + device = input_tokens.device + num_experts, out_features, in_features = weight_stack.shape + total_tokens = input_tokens.shape[0] + + # Prepare output tensor + output = torch.zeros( + total_tokens, out_features, dtype=strategy.DTYPE_TORCH, device=device + ) + + # Check for valid experts + if not torch.any(m_sizes_gpu > 0): + return output + + # Execute forward grouped GEMM using the strategy + output = CUTLASSBackwardGroupGemm._execute_forward_gemm( + input_tokens, weight_stack, m_sizes_gpu, m_offsets_gpu, output, strategy + ) + + return output + + @staticmethod + def backward(ctx, grad_output): + """ + Backward pass of grouped GEMM. + + Args: + ctx: PyTorch autograd context with saved tensors + grad_output: Gradient w.r.t. 
output [total_tokens, out_features] + + Returns: + Tuple of gradients: (grad_input, grad_weight, None, None, None) + """ + input_tokens, weight_stack, m_sizes, m_offsets = ctx.saved_tensors + strategy = ctx.strategy + + grad_output = grad_output.contiguous() + device = grad_output.device + + # Initialize gradients + grad_input = torch.zeros_like(input_tokens) + grad_weight = torch.zeros_like(weight_stack) + + m_sizes_gpu = m_sizes.to(device) if not m_sizes.is_cuda else m_sizes + m_offsets_gpu = m_offsets.to(device) if not m_offsets.is_cuda else m_offsets + + # Check for valid experts + if not torch.any(m_sizes_gpu > 0): + return grad_input, grad_weight, None, None, None + + # Compute gradient w.r.t. input: dX_i = dY_i @ W_i + grad_input = CUTLASSBackwardGroupGemm._execute_input_gradient_gemm( + grad_output, weight_stack, m_sizes_gpu, m_offsets_gpu, grad_input, strategy + ) + + # Compute gradient w.r.t. weight: dW_i = dY_i^T @ X_i + grad_weight = CUTLASSBackwardGroupGemm._execute_weight_gradient_gemm( + grad_output, input_tokens, m_sizes_gpu, m_offsets_gpu, grad_weight, strategy + ) + + return grad_input, grad_weight, None, None, None + + @staticmethod + def _execute_forward_gemm( + input_tokens, weight_stack, m_sizes_gpu, m_offsets_gpu, output, strategy + ): + """Execute forward pass: Y_i = X_i @ W_i^T""" + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return output + + # Prepare metadata for grouped GEMM + problem_sizes, strides_abc, ptrs_abc, outputs = ( + CUTLASSBackwardGroupGemm._prepare_forward_metadata( + input_tokens, + weight_stack, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + output.device, + ) + ) + + if len(problem_sizes) == 0: + return output + + # Execute CUTLASS grouped GEMM + CUTLASSBackwardGroupGemm._execute_cutlass_kernel( + problem_sizes, strides_abc, ptrs_abc, output.device, strategy + ) + + # Reconstruct output + return CUTLASSBackwardGroupGemm._reconstruct_output( + outputs, m_sizes_gpu, m_offsets_gpu, output + ) + + @staticmethod + def _execute_input_gradient_gemm( + grad_output, weight_stack, m_sizes_gpu, m_offsets_gpu, grad_input, strategy + ): + """Execute input gradient: dX_i = dY_i @ W_i""" + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return grad_input + + # Prepare metadata for input gradient GEMM + problem_sizes, strides_abc, ptrs_abc, outputs = ( + CUTLASSBackwardGroupGemm._prepare_input_grad_metadata( + grad_output, + weight_stack, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + grad_input.device, + ) + ) + + if len(problem_sizes) == 0: + return grad_input + + # Execute CUTLASS grouped GEMM + CUTLASSBackwardGroupGemm._execute_cutlass_kernel( + problem_sizes, strides_abc, ptrs_abc, grad_input.device, strategy + ) + + # Reconstruct gradient + return CUTLASSBackwardGroupGemm._reconstruct_output( + outputs, m_sizes_gpu, m_offsets_gpu, grad_input + ) + + @staticmethod + def _execute_weight_gradient_gemm( + grad_output, input_tokens, m_sizes_gpu, m_offsets_gpu, grad_weight, strategy + ): + """Execute weight gradient: dW_i = dY_i^T @ X_i""" + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return grad_weight + + # Prepare metadata for weight gradient GEMM + problem_sizes, strides_abc, ptrs_abc = ( + CUTLASSBackwardGroupGemm._prepare_weight_grad_metadata( + 
grad_output, + input_tokens, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + grad_weight, + strategy.DTYPE_TORCH, + ) + ) + + if len(problem_sizes) == 0: + return grad_weight + + # Execute CUTLASS grouped GEMM + CUTLASSBackwardGroupGemm._execute_cutlass_kernel( + problem_sizes, strides_abc, ptrs_abc, grad_weight.device, strategy + ) + + return grad_weight + + @staticmethod + def _prepare_forward_metadata( + input_tokens, weight_stack, m_sizes_gpu, m_offsets_gpu, valid_indices, device + ): + """Prepare metadata for forward pass: Y_i = X_i @ W_i^T""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + outputs = [] + + # Convert to CPU for iteration (minimal sync) + valid_sizes = m_sizes_gpu[valid_indices] + valid_offsets = ( + m_offsets_gpu[valid_indices] + if len(m_offsets_gpu) > len(valid_indices) + else torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 + ) + ) + + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + valid_indices_cpu = valid_indices.cpu().tolist() + + for expert_idx, size, offset in zip( + valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu + ): + if size > 0: + # Get expert data + expert_tokens = input_tokens[ + offset : offset + size + ].contiguous() # [M, K] + expert_weight = weight_stack[ + expert_idx + ].contiguous() # [N, K] - already transposed + + M, K = expert_tokens.shape + N, K_w = expert_weight.shape + assert K == K_w, f"Dimension mismatch: {K} != {K_w}" + L = 1 + + # Create output tensor + output = torch.empty(M, N, dtype=strategy.DTYPE_TORCH, device=device) + outputs.append(output) + + # Add to metadata: expert_tokens @ expert_weight^T + CUTLASSBackwardGroupGemm._add_gemm_to_metadata( + expert_tokens, + expert_weight, + output, + problem_sizes, + strides_abc, + ptrs_abc, + ) + + return problem_sizes, strides_abc, ptrs_abc, outputs + + @staticmethod + def _prepare_input_grad_metadata( + grad_output, weight_stack, m_sizes_gpu, m_offsets_gpu, valid_indices, device + ): + """Prepare metadata for input gradient: dX_i = dY_i @ W_i""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + outputs = [] + + # Convert to CPU for iteration (minimal sync) + valid_sizes = m_sizes_gpu[valid_indices] + valid_offsets = ( + m_offsets_gpu[valid_indices] + if len(m_offsets_gpu) > len(valid_indices) + else torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 + ) + ) + + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + valid_indices_cpu = valid_indices.cpu().tolist() + + for expert_idx, size, offset in zip( + valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu + ): + if size > 0: + # Get expert data + grad_expert = grad_output[offset : offset + size].contiguous() # [M, N] + expert_weight = weight_stack[expert_idx].contiguous() # [N, K] + + M, N = grad_expert.shape + N_w, K = expert_weight.shape + assert N == N_w, f"Dimension mismatch: {N} != {N_w}" + L = 1 + + # Create output tensor for gradient + grad_input_expert = torch.empty( + M, K, dtype=strategy.DTYPE_TORCH, device=device + ) + outputs.append(grad_input_expert) + + # Add to metadata: grad_expert @ expert_weight (no transpose needed) + CUTLASSBackwardGroupGemm._add_gemm_to_metadata( + grad_expert, + expert_weight, + grad_input_expert, + problem_sizes, + strides_abc, + ptrs_abc, + ) + + return problem_sizes, strides_abc, ptrs_abc, outputs + + @staticmethod + def _prepare_weight_grad_metadata( + grad_output, + input_tokens, + m_sizes_gpu, + 
m_offsets_gpu, + valid_indices, + grad_weight, + dtype, + ): + """Prepare metadata for weight gradient: dW_i = dY_i^T @ X_i""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + + # Convert to CPU for iteration (minimal sync) + valid_sizes = m_sizes_gpu[valid_indices] + valid_offsets = ( + m_offsets_gpu[valid_indices] + if len(m_offsets_gpu) > len(valid_indices) + else torch.cumsum( + torch.cat( + [torch.tensor([0], device=grad_weight.device), valid_sizes[:-1]] + ), + dim=0, + ) + ) + + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + valid_indices_cpu = valid_indices.cpu().tolist() + + for expert_idx, size, offset in zip( + valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu + ): + if size > 0: + # Get expert data + grad_expert = grad_output[offset : offset + size].contiguous() # [M, N] + input_expert = input_tokens[ + offset : offset + size + ].contiguous() # [M, K] + + M, N = grad_expert.shape + M_i, K = input_expert.shape + assert M == M_i, f"Dimension mismatch: {M} != {M_i}" + L = 1 + + # Get output tensor (slice of grad_weight for this expert) + grad_weight_expert = grad_weight[expert_idx] # [N, K] + + # For dW = dY^T @ X, we need to transpose dY + # This means we compute: X^T @ dY -> (dY^T @ X)^T = dW^T, then transpose result + # Actually, let's compute grad_expert^T @ input_expert directly + grad_expert_t = grad_expert.t().contiguous() # [N, M] + + # Add to metadata: grad_expert_t @ input_expert -> grad_weight_expert + CUTLASSBackwardGroupGemm._add_gemm_to_metadata( + grad_expert_t, + input_expert, + grad_weight_expert, + problem_sizes, + strides_abc, + ptrs_abc, + ) + + return problem_sizes, strides_abc, ptrs_abc + + @staticmethod + def _add_gemm_to_metadata(A, B, C, problem_sizes, strides_abc, ptrs_abc): + """Add a single GEMM operation to metadata lists.""" + M, K = A.shape + K_b, N = B.shape + assert K == K_b, f"Inner dimension mismatch: {K} != {K_b}" + + # CUTLASS expects B to be [N, K] for A @ B^T, but we have B as [K, N] + # So we need to transpose B or adjust our computation + # For A @ B -> C where A is [M, K] and B is [K, N], we need B^T which is [N, K] + B_transposed = B.t().contiguous() # [N, K] + + L = 1 + + # Convert to MNKL format + A_mnkl = A.unsqueeze(-1).contiguous() # [M, K, 1] + B_mnkl = B_transposed.unsqueeze(-1).contiguous() # [N, K, 1] + C_mnkl = C.unsqueeze(-1).contiguous() # [M, N, 1] + + # Extract strides + A_strides = list(A_mnkl.stride()[:2]) + B_strides = list(B_mnkl.stride()[:2]) + C_strides = list(C_mnkl.stride()[:2]) + + # Add to metadata + problem_sizes.append([M, N, K, L]) + strides_abc.append([A_strides, B_strides, C_strides]) + ptrs_abc.append([A.data_ptr(), B_transposed.data_ptr(), C.data_ptr()]) + + @staticmethod + def _execute_cutlass_kernel(problem_sizes, strides_abc, ptrs_abc, device, strategy): + """Execute the CUTLASS grouped GEMM kernel.""" + num_groups = len(problem_sizes) + + # Convert to CUTE tensors + problem_sizes_tensor = torch.tensor( + problem_sizes, dtype=torch.int32, device=device + ) + strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) + ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) + + problem_sizes_cute = from_dlpack( + problem_sizes_tensor, assumed_align=strategy.ALIGNMENT + ) + strides_cute = from_dlpack(strides_tensor, assumed_align=strategy.ALIGNMENT) + ptrs_cute = from_dlpack(ptrs_tensor, assumed_align=strategy.ALIGNMENT) + + # Get tensormap and compute clusters + tensormap_cute = 
strategy._get_tensormap_buffer(device) + total_clusters = strategy._compute_total_clusters(problem_sizes) + + # Create initial tensors for compilation + initial_tensors = strategy._create_initial_tensors(problem_sizes[0], device) + + # Get compiled kernel + compiled_kernel = strategy._get_compiled_kernel( + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ) + + # Execute kernel + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + strategy.stream, + ) + torch.cuda.synchronize() + + @staticmethod + def _reconstruct_output(outputs, m_sizes_gpu, m_offsets_gpu, full_output): + """Reconstruct full output tensor from expert results.""" + if not outputs: + return full_output + + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + valid_sizes = m_sizes_gpu[valid_indices] + + # Compute offsets + if len(m_offsets_gpu) <= len(valid_indices): + valid_offsets = torch.cumsum( + torch.cat( + [torch.tensor([0], device=m_sizes_gpu.device), valid_sizes[:-1]] + ), + dim=0, + ) + else: + valid_offsets = m_offsets_gpu[valid_indices] + + # Convert to CPU for reconstruction (minimal sync) + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + + for i, (size, offset) in enumerate(zip(valid_sizes_cpu, valid_offsets_cpu)): + if i < len(outputs): + full_output[offset : offset + size] = outputs[i] + + return full_output + + +class CUTLASSGroupedLinear(nn.Module): + """ + A PyTorch module that wraps CUTLASS grouped GEMM with automatic differentiation. + + This module performs grouped linear transformations using CUTLASS kernels, + with support for forward and backward passes through PyTorch autograd. + + Usage: + layer = CUTLASSGroupedLinear( + num_experts=8, + in_features=4096, + out_features=11008, + strategy=your_cutlass_strategy + ) + + output = layer(input_tokens, expert_assignments) + """ + + def __init__( + self, + num_experts: int, + in_features: int, + out_features: int, + strategy, + bias: bool = False, + dtype: torch.dtype = torch.bfloat16, + ): + """ + Initialize the CUTLASS grouped linear layer. + + Args: + num_experts: Number of experts + in_features: Input feature dimension + out_features: Output feature dimension + strategy: CUTLASSGroupedGemmStrategy instance + bias: Whether to include bias (not yet implemented) + dtype: Data type for weights + """ + super().__init__() + + if bias: + raise NotImplementedError( + "Bias not yet implemented for CUTLASS grouped linear" + ) + + self.num_experts = num_experts + self.in_features = in_features + self.out_features = out_features + self.strategy = strategy + self.dtype = dtype + + # Initialize expert weights + self.weight = nn.Parameter( + torch.empty(num_experts, out_features, in_features, dtype=dtype) + ) + + self.reset_parameters() + + def reset_parameters(self): + """Initialize parameters using standard initialization.""" + for expert_idx in range(self.num_experts): + nn.init.kaiming_uniform_(self.weight[expert_idx], a=1.41421356) # sqrt(2) + + def forward( + self, input_tokens: torch.Tensor, expert_assignments: torch.Tensor + ) -> torch.Tensor: + """ + Forward pass through grouped linear layer. 
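+
+        A worked example of the bookkeeping below (illustrative numbers only):
+        with num_experts=2 and expert_assignments=[1, 0, 1, 1],
+        _compute_expert_sizes_and_offsets returns m_sizes=[1, 3] and
+        m_offsets=[0, 1, 4]; argsort places the single expert-0 token first,
+        the grouped GEMM then runs on that contiguous layout, and the result is
+        scattered back to the original token order via
+        output[sorted_indices] = sorted_output.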
+ + Args: + input_tokens: Input tokens [total_tokens, in_features] + expert_assignments: Expert assignment per token [total_tokens] + + Returns: + output: Transformed tokens [total_tokens, out_features] + """ + # Compute sizes and offsets for each expert + m_sizes, m_offsets = self._compute_expert_sizes_and_offsets(expert_assignments) + + # Sort tokens by expert assignment for contiguous access + sorted_indices = torch.argsort(expert_assignments) + sorted_tokens = input_tokens[sorted_indices] + + # Apply grouped GEMM + sorted_output = CUTLASSBackwardGroupGemm.apply( + sorted_tokens, self.weight, m_sizes, m_offsets, self.strategy + ) + + # Unsort to restore original order + output = torch.empty_like(sorted_output) + output[sorted_indices] = sorted_output + + return output + + def _compute_expert_sizes_and_offsets( + self, expert_assignments: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Compute the number of tokens assigned to each expert and their offsets. + + Args: + expert_assignments: Expert assignment per token [total_tokens] + + Returns: + Tuple of (sizes, offsets) tensors + """ + device = expert_assignments.device + + # Count tokens per expert + m_sizes = torch.zeros(self.num_experts, dtype=torch.int32, device=device) + for expert_idx in range(self.num_experts): + m_sizes[expert_idx] = (expert_assignments == expert_idx).sum() + + # Compute cumulative offsets + m_offsets = torch.cat( + [torch.tensor([0], device=device), torch.cumsum(m_sizes, dim=0)] + ) + + return m_sizes, m_offsets + + def extra_repr(self) -> str: + """Return string representation of module parameters.""" + return f"num_experts={self.num_experts}, in_features={self.in_features}, out_features={self.out_features}" + + +# Example usage and testing functions +def test_cutlass_backward_group_gemm(): + """Test the CUTLASS backward group GEMM implementation.""" + print("Testing CUTLASS Backward Group GEMM...") + + # Setup + device = torch.device("cuda") + dtype = torch.bfloat16 + + num_experts = 4 + in_features = 1024 + out_features = 2048 + total_tokens = 512 + + print( + f"Creating strategy for {num_experts} experts, {in_features} in_features, {out_features} out_features, {total_tokens} total_tokens" + ) + # Create strategy (assuming it's available) + + strategy = CUTLASSGroupedGemmStrategy( + custom_activation=lambda x: x, # Identity for testing + use_2cta_instrs=True, + mma_tiler_mn=(256, 128), + cluster_shape_mn=(2, 2), + ) + print(f"Using strategy: {strategy}") + # Create test data + input_tokens = torch.randn( + total_tokens, in_features, dtype=dtype, device=device, requires_grad=True + ) + expert_assignments = torch.randint(0, num_experts, (total_tokens,), device=device) + + # Create layer + layer = CUTLASSGroupedLinear( + num_experts=num_experts, + in_features=in_features, + out_features=out_features, + strategy=strategy, + dtype=dtype, + ) + + layer = layer.to(device) + + # Forward pass + output = layer(input_tokens, expert_assignments) + + # Backward pass + loss = output.sum() + loss.backward() + + # Check gradients + assert input_tokens.grad is not None, "Input gradient should not be None" + assert layer.weight.grad is not None, "Weight gradient should not be None" + + print("✓ CUTLASS Backward Group GEMM test passed!") + + +if __name__ == "__main__": + test_cutlass_backward_group_gemm() diff --git a/torchtitan/experiments/deepseek_v3/group_gemms.py b/torchtitan/experiments/deepseek_v3/group_gemms.py index 6c76317b6..753cb86b2 100644 --- a/torchtitan/experiments/deepseek_v3/group_gemms.py +++ 
b/torchtitan/experiments/deepseek_v3/group_gemms.py @@ -19,10 +19,10 @@ import dsgemm_utils try: - from torchao.float8.config import ScalingGranularity - from torchao.float8.float8_utils import tensor_to_scale, to_fp8_saturated + # from torchao.float8.config import ScalingGranularity + # from torchao.float8.float8_utils import tensor_to_scale, to_fp8_saturated - TORCHAO_FP8_GG_AVAILABLE = True + TORCHAO_FP8_GG_AVAILABLE = False except ImportError: TORCHAO_FP8_GG_AVAILABLE = False diff --git a/torchtitan/experiments/deepseek_v3/hw_info.py b/torchtitan/experiments/deepseek_v3/hw_info.py new file mode 100644 index 000000000..da9e5a29c --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/hw_info.py @@ -0,0 +1,17 @@ +import cuda.bindings.driver as cuda +import cutlass +import cutlass.cute as cute +import cutlass.torch as cutlass_torch +import cutlass.utils as utils + +from cutlass.cute.runtime import from_dlpack + +cluster_shape_mn = (4, 4) + +hardware_info = utils.HardwareInfo() +print(f"hardware_info: {hardware_info}") +max_active_clusters = hardware_info.get_max_active_clusters( + cluster_shape_mn[0] * cluster_shape_mn[1] +) + +print(f"max_active_clusters: {max_active_clusters}") From ad9d29b91f553b7e7ad0ed753d7f5d1881c33118 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Thu, 12 Jun 2025 20:42:18 -0700 Subject: [PATCH 12/34] initial backwards for cutlass, simple test working --- .../deepseek_v3/cutlass_backwards.py | 70 +++++++++++++++++-- 1 file changed, 63 insertions(+), 7 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/cutlass_backwards.py b/torchtitan/experiments/deepseek_v3/cutlass_backwards.py index 624a47d97..ca902c09a 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_backwards.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_backwards.py @@ -235,6 +235,9 @@ def _initialize_kernel(self): def _initialize_hardware(self): """Initialize hardware information and stream.""" + # TODO - this is a workaround for dsl cuda context requirement + dummy_tensor = torch.zeros(1, device="cuda") + dummy_tensor.cpu() self.hardware_info = utils.HardwareInfo() self.max_active_clusters = self.hardware_info.get_max_active_clusters( self.cluster_shape_mn[0] * self.cluster_shape_mn[1] @@ -915,6 +918,7 @@ def _execute_forward_gemm( m_offsets_gpu, valid_indices, output.device, + strategy, ) ) @@ -952,6 +956,7 @@ def _execute_input_gradient_gemm( m_offsets_gpu, valid_indices, grad_input.device, + strategy, ) ) @@ -1005,7 +1010,13 @@ def _execute_weight_gradient_gemm( @staticmethod def _prepare_forward_metadata( - input_tokens, weight_stack, m_sizes_gpu, m_offsets_gpu, valid_indices, device + input_tokens, + weight_stack, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + strategy, ): """Prepare metadata for forward pass: Y_i = X_i @ W_i^T""" problem_sizes = [] @@ -1062,7 +1073,13 @@ def _prepare_forward_metadata( @staticmethod def _prepare_input_grad_metadata( - grad_output, weight_stack, m_sizes_gpu, m_offsets_gpu, valid_indices, device + grad_output, + weight_stack, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + strategy, ): """Prepare metadata for input gradient: dX_i = dY_i @ W_i""" problem_sizes = [] @@ -1186,13 +1203,17 @@ def _prepare_weight_grad_metadata( def _add_gemm_to_metadata(A, B, C, problem_sizes, strides_abc, ptrs_abc): """Add a single GEMM operation to metadata lists.""" M, K = A.shape - K_b, N = B.shape + # Check if B is [N, K] or [K, N] and handle accordingly + if B.shape[1] == K: # B is [N, K] + N, K_b = B.shape + else: # B is [K, N] + K_b, N = 
B.shape + # Transpose B for the computation + B = B.t().contiguous() + assert K == K_b, f"Inner dimension mismatch: {K} != {K_b}" - # CUTLASS expects B to be [N, K] for A @ B^T, but we have B as [K, N] - # So we need to transpose B or adjust our computation - # For A @ B -> C where A is [M, K] and B is [K, N], we need B^T which is [N, K] - B_transposed = B.t().contiguous() # [N, K] + B_transposed = B # .t().contiguous() # [N, K] L = 1 @@ -1416,6 +1437,41 @@ def extra_repr(self) -> str: return f"num_experts={self.num_experts}, in_features={self.in_features}, out_features={self.out_features}" +def _initialize_hardware_test(self): + """Initialize hardware information and stream.""" + # Force CUDA context creation by performing a simple operation + # This ensures the context exists before HardwareInfo queries it + dummy_tensor = torch.zeros(1, device="cuda") + dummy_tensor.cpu() # Force synchronization to establish context + + # Now it's safe to create HardwareInfo + hardware_info = utils.HardwareInfo() + max_active_clusters = self.hardware_info.get_max_active_clusters( + self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + ) + + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) + + +def _debug_cuda_context(self): + """Debug CUDA context state.""" + try: + import cuda.bindings.driver as cuda_driver + + try: + current_context = cuda_driver.cuCtxGetCurrent() + print(f"CUDA context exists: {current_context is not None}") + except: + print("No CUDA context found") + + print(f"PyTorch CUDA available: {torch.cuda.is_available()}") + print(f"PyTorch CUDA initialized: {torch.cuda.is_initialized()}") + + except Exception as e: + print(f"Debug failed: {e}") + + # Example usage and testing functions def test_cutlass_backward_group_gemm(): """Test the CUTLASS backward group GEMM implementation.""" From 9a02ac0311c0825dd0a8b4463f8e719a9417cca7 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Thu, 12 Jun 2025 20:51:39 -0700 Subject: [PATCH 13/34] backwards, add initial numerics check (failing) --- .../deepseek_v3/cutlass_integration.py | 319 +++++++++ .../deepseek_v3/cutlass_test_driver.py | 644 ++++++++++++++++++ 2 files changed, 963 insertions(+) create mode 100644 torchtitan/experiments/deepseek_v3/cutlass_integration.py create mode 100644 torchtitan/experiments/deepseek_v3/cutlass_test_driver.py diff --git a/torchtitan/experiments/deepseek_v3/cutlass_integration.py b/torchtitan/experiments/deepseek_v3/cutlass_integration.py new file mode 100644 index 000000000..2de4b4cf7 --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/cutlass_integration.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +""" +Integration example showing how to use the CUTLASS test driver with the actual strategy. +""" + +import os +import sys + +import torch + + +try: + import cuda.bindings.driver as cuda + import cutlass + import cutlass.cute as cute + import cutlass.torch as cutlass_torch + import cutlass.utils as utils + + from cutlass.cute.runtime import from_dlpack + from cutlass_backwards import ( + CUTLASSBackwardGroupGemm, + CUTLASSGroupedGemmStrategy, + CUTLASSGroupedLinear, + ) + + CUTLASS_AVAILABLE = True +except ImportError: + print("CUTLASS modules not found. 
Please update the import paths.") + CUTLASS_AVAILABLE = False + + +from cutlass_test_driver import GroupGemmTestDriver, PyTorchManualGroupedLinear + + +def create_cutlass_strategy( + use_2cta_instrs=False, mma_tiler_mn=(128, 128), cluster_shape_mn=(1, 1) +): + """Create a CUTLASS strategy with specified configuration.""" + if not CUTLASS_AVAILABLE: + raise RuntimeError("CUTLASS not available") + + strategy = CUTLASSGroupedGemmStrategy( + custom_activation=lambda x: x, # Identity for linear layers + use_2cta_instrs=use_2cta_instrs, + mma_tiler_mn=mma_tiler_mn, + cluster_shape_mn=cluster_shape_mn, + ) + + return strategy + + +def test_cutlass_vs_manual(): + """Test CUTLASS implementation against manual PyTorch implementation.""" + print("🧪 Testing CUTLASS vs Manual PyTorch Implementation") + print("=" * 60) + + # Configuration + device = torch.device("cuda") + dtype = torch.bfloat16 + + num_experts = 8 + total_tokens = 1024 + in_features = 2048 + out_features = 4096 + + # Generate test data + input_tokens = torch.randn( + total_tokens, in_features, dtype=dtype, device=device, requires_grad=True + ) + expert_assignments = torch.randint(0, num_experts, (total_tokens,), device=device) + + # Create strategy and layers + if CUTLASS_AVAILABLE: + strategy = create_cutlass_strategy() + cutlass_layer = CUTLASSGroupedLinear( + num_experts=num_experts, + in_features=in_features, + out_features=out_features, + strategy=strategy, + dtype=dtype, + ).to(device) + else: + print("❌ CUTLASS not available, skipping CUTLASS tests") + return + + manual_layer = PyTorchManualGroupedLinear( + num_experts=num_experts, + in_features=in_features, + out_features=out_features, + dtype=dtype, + ).to(device) + + # Copy weights to ensure fair comparison + cutlass_layer.weight.data.copy_(manual_layer.weight.data) + + print("🔍 Testing Forward Pass...") + + # Forward pass + input_manual = input_tokens.clone().detach().requires_grad_(True) + input_cutlass = input_tokens.clone().detach().requires_grad_(True) + + output_manual = manual_layer(input_manual, expert_assignments) + output_cutlass = cutlass_layer(input_cutlass, expert_assignments) + + # Check forward pass + forward_diff = torch.abs(output_manual - output_cutlass).max().item() + forward_close = torch.allclose(output_manual, output_cutlass, rtol=1e-3, atol=1e-3) + + print(f" Forward max difference: {forward_diff:.2e}") + print(f" Forward outputs close: {'✓' if forward_close else '❌'}") + + print("🔍 Testing Backward Pass...") + + # Backward pass + loss_manual = output_manual.sum() + loss_cutlass = output_cutlass.sum() + + loss_manual.backward() + loss_cutlass.backward() + + # Check gradients + input_grad_diff = torch.abs(input_manual.grad - input_cutlass.grad).max().item() + input_grad_close = torch.allclose( + input_manual.grad, input_cutlass.grad, rtol=1e-3, atol=1e-3 + ) + + weight_grad_diff = ( + torch.abs(manual_layer.weight.grad - cutlass_layer.weight.grad).max().item() + ) + weight_grad_close = torch.allclose( + manual_layer.weight.grad, cutlass_layer.weight.grad, rtol=1e-3, atol=1e-3 + ) + + print(f" Input grad max difference: {input_grad_diff:.2e}") + print(f" Input gradients close: {'✓' if input_grad_close else '❌'}") + print(f" Weight grad max difference: {weight_grad_diff:.2e}") + print(f" Weight gradients close: {'✓' if weight_grad_close else '❌'}") + + # Overall result + all_correct = forward_close and input_grad_close and weight_grad_close + print(f"\n🎯 Overall Result: {'✅ PASS' if all_correct else '❌ FAIL'}") + + return all_correct + + +def 
benchmark_cutlass_vs_manual(): + """Benchmark CUTLASS vs manual implementation.""" + if not CUTLASS_AVAILABLE: + print("❌ CUTLASS not available, cannot run benchmarks") + return + + print("\n🚀 Benchmarking CUTLASS vs Manual Implementation") + print("=" * 60) + + # Import triton for benchmarking + try: + from triton.testing import do_bench + except ImportError: + print("❌ Triton not available, using basic timing") + do_bench = None + + # Test configurations + configs = [ + { + "num_experts": 8, + "total_tokens": 1024, + "in_features": 2048, + "out_features": 4096, + "name": "Medium", + }, + { + "num_experts": 8, + "total_tokens": 2048, + "in_features": 4096, + "out_features": 11008, + "name": "MoE-7B", + }, + { + "num_experts": 64, + "total_tokens": 4096, + "in_features": 4096, + "out_features": 11008, + "name": "MoE-Large", + }, + ] + + device = torch.device("cuda") + dtype = torch.bfloat16 + + for config in configs: + print( + f"\n📊 {config['name']}: {config['num_experts']} experts, {config['total_tokens']} tokens" + ) + + # Setup + num_experts = config["num_experts"] + total_tokens = config["total_tokens"] + in_features = config["in_features"] + out_features = config["out_features"] + + # Create test data + input_tokens = torch.randn( + total_tokens, in_features, dtype=dtype, device=device, requires_grad=True + ) + expert_assignments = torch.randint( + 0, num_experts, (total_tokens,), device=device + ) + + # Create layers + strategy = create_cutlass_strategy() + cutlass_layer = CUTLASSGroupedLinear( + num_experts, in_features, out_features, strategy, dtype + ).to(device) + manual_layer = PyTorchManualGroupedLinear( + num_experts, in_features, out_features, dtype + ).to(device) + + # Copy weights + cutlass_layer.weight.data.copy_(manual_layer.weight.data) + + # Benchmark functions + def manual_forward(): + return manual_layer(input_tokens, expert_assignments) + + def cutlass_forward(): + return cutlass_layer(input_tokens, expert_assignments) + + def manual_backward(): + input_clone = input_tokens.clone().detach().requires_grad_(True) + manual_layer.zero_grad() + output = manual_layer(input_clone, expert_assignments) + loss = output.sum() + loss.backward() + return loss + + def cutlass_backward(): + input_clone = input_tokens.clone().detach().requires_grad_(True) + cutlass_layer.zero_grad() + output = cutlass_layer(input_clone, expert_assignments) + loss = output.sum() + loss.backward() + return loss + + # Run benchmarks + if do_bench: + manual_fwd_time = do_bench(manual_forward, warmup=5, rep=10) + cutlass_fwd_time = do_bench(cutlass_forward, warmup=5, rep=10) + manual_bwd_time = do_bench(manual_backward, warmup=5, rep=10) + cutlass_bwd_time = do_bench(cutlass_backward, warmup=5, rep=10) + else: + # Basic timing fallback + import time + + # Forward timing + torch.cuda.synchronize() + start = time.time() + for _ in range(10): + manual_forward() + torch.cuda.synchronize() + manual_fwd_time = (time.time() - start) / 10 * 1000 + + torch.cuda.synchronize() + start = time.time() + for _ in range(10): + cutlass_forward() + torch.cuda.synchronize() + cutlass_fwd_time = (time.time() - start) / 10 * 1000 + + # Backward timing + torch.cuda.synchronize() + start = time.time() + for _ in range(10): + manual_backward() + torch.cuda.synchronize() + manual_bwd_time = (time.time() - start) / 10 * 1000 + + torch.cuda.synchronize() + start = time.time() + for _ in range(10): + cutlass_backward() + torch.cuda.synchronize() + cutlass_bwd_time = (time.time() - start) / 10 * 1000 + + # Calculate speedups + 
fwd_speedup = ( + manual_fwd_time / cutlass_fwd_time if cutlass_fwd_time > 0 else float("inf") + ) + bwd_speedup = ( + manual_bwd_time / cutlass_bwd_time if cutlass_bwd_time > 0 else float("inf") + ) + + print( + f" Forward: Manual={manual_fwd_time:.2f}ms, CUTLASS={cutlass_fwd_time:.2f}ms, Speedup={fwd_speedup:.2f}x" + ) + print( + f" Backward: Manual={manual_bwd_time:.2f}ms, CUTLASS={cutlass_bwd_time:.2f}ms, Speedup={bwd_speedup:.2f}x" + ) + + +def main(): + """Main integration test.""" + print("🎯 CUTLASS Group GEMM Integration Test") + + # Test numerical correctness + if CUTLASS_AVAILABLE: + test_cutlass_vs_manual() + + # Benchmark performance + benchmark_cutlass_vs_manual() + else: + print("❌ CUTLASS not available. Please ensure:") + print(" 1. CUTLASS Python bindings are installed") + print(" 2. cutlass_backward_group_gemm.py is available") + print(" 3. cutlass_strategy.py is available") + print(" 4. Update import paths in this script") + + +if __name__ == "__main__": + main() diff --git a/torchtitan/experiments/deepseek_v3/cutlass_test_driver.py b/torchtitan/experiments/deepseek_v3/cutlass_test_driver.py new file mode 100644 index 000000000..fd2b97fda --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/cutlass_test_driver.py @@ -0,0 +1,644 @@ +import argparse +import time +from typing import List, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Triton benchmarking +try: + import triton + from triton.testing import do_bench + + TRITON_AVAILABLE = True +except ImportError: + TRITON_AVAILABLE = False + print("Warning: Triton not available, using basic timing") + +# Import CUTLASS components (assuming they're available) +try: + import cuda.bindings.driver as cuda + import cutlass + import cutlass.cute as cute + import cutlass.torch as cutlass_torch + import cutlass.utils as utils + + from cutlass.cute.runtime import from_dlpack + from cutlass_backward_group_gemm import ( + CUTLASSBackwardGroupGemm, + CUTLASSGroupedGemmStrategy, + CUTLASSGroupedLinear, + ) +except ImportError: + print("CUTLASS modules not found. Please update the import paths.") + CUTLASS_AVAILABLE = False + + +class PyTorchManualGroupGemm(torch.autograd.Function): + """ + Reference implementation using manual PyTorch loops for comparison. 
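+
+    Per expert i this computes the same quantities as the CUTLASS path, just
+    with plain torch.mm calls:
+
+        forward:  Y_i = X_i @ W_i^T
+        backward: dX_i = dY_i @ W_i
+                  dW_i = dY_i^T @ X_i
+
+    which makes it a convenient numerical reference for checking the grouped
+    kernel's outputs and gradients.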
+ """ + + @staticmethod + def forward(ctx, input_tokens, weight_stack, m_sizes, m_offsets): + """Manual forward pass using PyTorch loops.""" + ctx.save_for_backward(input_tokens, weight_stack, m_sizes, m_offsets) + + device = input_tokens.device + total_tokens, in_features = input_tokens.shape + num_experts, out_features, _ = weight_stack.shape + + output = torch.zeros( + total_tokens, out_features, dtype=input_tokens.dtype, device=device + ) + + # Manual loop over experts + offset = 0 + for expert_idx, size in enumerate(m_sizes.cpu().tolist()): + if size > 0: + # Get tokens for this expert + expert_tokens = input_tokens[ + offset : offset + size + ] # [size, in_features] + expert_weight = weight_stack[expert_idx] # [out_features, in_features] + + # Compute: expert_tokens @ expert_weight.T + expert_output = torch.mm( + expert_tokens, expert_weight.t() + ) # [size, out_features] + + # Store results + output[offset : offset + size] = expert_output + + offset += size + + return output + + @staticmethod + def backward(ctx, grad_output): + """Manual backward pass using PyTorch loops.""" + input_tokens, weight_stack, m_sizes, m_offsets = ctx.saved_tensors + + device = grad_output.device + grad_input = torch.zeros_like(input_tokens) + grad_weight = torch.zeros_like(weight_stack) + + # Manual loop over experts + offset = 0 + for expert_idx, size in enumerate(m_sizes.cpu().tolist()): + if size > 0: + # Get gradients for this expert + grad_expert = grad_output[ + offset : offset + size + ] # [size, out_features] + expert_tokens = input_tokens[ + offset : offset + size + ] # [size, in_features] + expert_weight = weight_stack[expert_idx] # [out_features, in_features] + + # Input gradient: grad_expert @ expert_weight + grad_input[offset : offset + size] = torch.mm( + grad_expert, expert_weight + ) + + # Weight gradient: grad_expert.T @ expert_tokens + grad_weight[expert_idx] = torch.mm(grad_expert.t(), expert_tokens) + + offset += size + + return grad_input, grad_weight, None, None + + +class PyTorchManualGroupedLinear(nn.Module): + """Reference grouped linear layer using manual loops.""" + + def __init__( + self, + num_experts: int, + in_features: int, + out_features: int, + dtype: torch.dtype = torch.bfloat16, + ): + super().__init__() + self.num_experts = num_experts + self.in_features = in_features + self.out_features = out_features + self.dtype = dtype + + self.weight = nn.Parameter( + torch.empty(num_experts, out_features, in_features, dtype=dtype) + ) + self.reset_parameters() + + def reset_parameters(self): + for expert_idx in range(self.num_experts): + nn.init.kaiming_uniform_(self.weight[expert_idx], a=1.41421356) + + def forward( + self, input_tokens: torch.Tensor, expert_assignments: torch.Tensor + ) -> torch.Tensor: + m_sizes, m_offsets = self._compute_expert_sizes_and_offsets(expert_assignments) + + # Sort tokens by expert assignment + sorted_indices = torch.argsort(expert_assignments) + sorted_tokens = input_tokens[sorted_indices] + + # Apply manual grouped GEMM + sorted_output = PyTorchManualGroupGemm.apply( + sorted_tokens, self.weight, m_sizes, m_offsets + ) + + # Unsort to restore original order + output = torch.empty_like(sorted_output) + output[sorted_indices] = sorted_output + + return output + + def _compute_expert_sizes_and_offsets( + self, expert_assignments: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + device = expert_assignments.device + m_sizes = torch.zeros(self.num_experts, dtype=torch.int32, device=device) + + for expert_idx in range(self.num_experts): + 
m_sizes[expert_idx] = (expert_assignments == expert_idx).sum() + + m_offsets = torch.cat( + [torch.tensor([0], device=device), torch.cumsum(m_sizes, dim=0)] + ) + return m_sizes, m_offsets + + +class GroupGemmTestDriver: + """Test driver for comparing CUTLASS vs PyTorch manual implementation.""" + + def __init__(self, device="cuda", dtype=torch.bfloat16): + self.device = torch.device(device) + self.dtype = dtype + + def generate_test_data( + self, + num_experts: int, + total_tokens: int, + in_features: int, + out_features: int, + expert_balance: str = "uniform", + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Generate test data for benchmarking.""" + + # Create input tokens + input_tokens = torch.randn( + total_tokens, + in_features, + dtype=self.dtype, + device=self.device, + requires_grad=True, + ) + + # Create expert assignments + if expert_balance == "uniform": + expert_assignments = torch.randint( + 0, num_experts, (total_tokens,), device=self.device + ) + elif expert_balance == "imbalanced": + # Create imbalanced distribution (some experts get more tokens) + probs = torch.tensor([0.4, 0.3, 0.2, 0.1] + [0.0] * (num_experts - 4))[ + :num_experts + ] + probs = probs / probs.sum() + expert_assignments = torch.multinomial( + probs, total_tokens, replacement=True + ).to(self.device) + elif expert_balance == "sparse": + # Only use first half of experts + expert_assignments = torch.randint( + 0, num_experts // 2, (total_tokens,), device=self.device + ) + else: + raise ValueError(f"Unknown expert_balance: {expert_balance}") + + return input_tokens, expert_assignments + + def test_numerical_correctness( + self, + num_experts=4, + total_tokens=256, + in_features=512, + out_features=1024, + rtol=1e-3, + atol=1e-3, + ): + """Test numerical correctness between CUTLASS and PyTorch manual implementations.""" + print(f"\n🧮 Testing Numerical Correctness") + print( + f" Problem size: {num_experts} experts, {total_tokens} tokens, {in_features}→{out_features}" + ) + + # Generate test data + input_tokens, expert_assignments = self.generate_test_data( + num_experts, total_tokens, in_features, out_features + ) + + # Create both implementations with same weights + manual_layer = PyTorchManualGroupedLinear( + num_experts, in_features, out_features, self.dtype + ).to(self.device) + + # For now, we'll test the manual implementation against itself to verify the test setup + # When CUTLASS implementation is available, we'll copy weights and compare + + # cutlass_layer = CUTLASSGroupedLinear(num_experts, in_features, out_features, strategy, dtype=self.dtype).to(self.device) + # cutlass_layer.weight.data.copy_(manual_layer.weight.data) + + print(" Testing forward pass...") + + # Forward pass - Manual + input_tokens_manual = input_tokens.clone().detach().requires_grad_(True) + output_manual = manual_layer(input_tokens_manual, expert_assignments) + + # For now, test manual against manual (placeholder for CUTLASS comparison) + input_tokens_cutlass = input_tokens.clone().detach().requires_grad_(True) + output_cutlass = manual_layer( + input_tokens_cutlass, expert_assignments + ) # Replace with cutlass_layer when available + + # Check forward pass + forward_diff = torch.abs(output_manual - output_cutlass).max().item() + forward_close = torch.allclose( + output_manual, output_cutlass, rtol=rtol, atol=atol + ) + + print(f" ✓ Forward pass max diff: {forward_diff:.2e}") + print(f" ✓ Forward pass close: {forward_close}") + + print(" Testing backward pass...") + + # Backward pass + loss_manual = output_manual.sum() + 
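+        # A plain .sum() loss makes grad_output an all-ones tensor of the output
+        # shape, so the checks below exercise both the input- and weight-gradient paths.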
loss_cutlass = output_cutlass.sum() + + loss_manual.backward() + loss_cutlass.backward() + + # Check input gradients + if ( + input_tokens_manual.grad is not None + and input_tokens_cutlass.grad is not None + ): + input_grad_diff = ( + torch.abs(input_tokens_manual.grad - input_tokens_cutlass.grad) + .max() + .item() + ) + input_grad_close = torch.allclose( + input_tokens_manual.grad, + input_tokens_cutlass.grad, + rtol=rtol, + atol=atol, + ) + + print(f" ✓ Input gradient max diff: {input_grad_diff:.2e}") + print(f" ✓ Input gradient close: {input_grad_close}") + + # Check weight gradients + if ( + manual_layer.weight.grad is not None + ): # and cutlass_layer.weight.grad is not None: + weight_grad_diff = ( + torch.abs(manual_layer.weight.grad - manual_layer.weight.grad) + .max() + .item() + ) # Replace with cutlass comparison + weight_grad_close = True # Replace with actual comparison + + print(f" ✓ Weight gradient max diff: {weight_grad_diff:.2e}") + print(f" ✓ Weight gradient close: {weight_grad_close}") + + return forward_close and input_grad_close and weight_grad_close + + def benchmark_forward_pass(self, config: dict, warmup=5, reps=10): + """Benchmark forward pass performance.""" + num_experts = config["num_experts"] + total_tokens = config["total_tokens"] + in_features = config["in_features"] + out_features = config["out_features"] + + # Generate test data + input_tokens, expert_assignments = self.generate_test_data( + num_experts, total_tokens, in_features, out_features + ) + + # Create layers + manual_layer = PyTorchManualGroupedLinear( + num_experts, in_features, out_features, self.dtype + ).to(self.device) + # cutlass_layer = CUTLASSGroupedLinear(num_experts, in_features, out_features, strategy, dtype=self.dtype).to(self.device) + # cutlass_layer.weight.data.copy_(manual_layer.weight.data) + + def manual_forward(): + return manual_layer(input_tokens, expert_assignments) + + def cutlass_forward(): + return manual_layer( + input_tokens, expert_assignments + ) # Replace with cutlass_layer when available + + # Benchmark using Triton if available + if TRITON_AVAILABLE: + manual_time = do_bench(manual_forward, warmup=warmup, rep=reps) + cutlass_time = do_bench(cutlass_forward, warmup=warmup, rep=reps) + else: + # Fallback timing + manual_time = self._basic_benchmark(manual_forward, warmup, reps) + cutlass_time = self._basic_benchmark(cutlass_forward, warmup, reps) + + return { + "manual_time": manual_time, + "cutlass_time": cutlass_time, + "speedup": manual_time / cutlass_time if cutlass_time > 0 else float("inf"), + } + + def benchmark_backward_pass(self, config: dict, warmup=5, reps=10): + """Benchmark backward pass performance.""" + num_experts = config["num_experts"] + total_tokens = config["total_tokens"] + in_features = config["in_features"] + out_features = config["out_features"] + + # Generate test data + input_tokens, expert_assignments = self.generate_test_data( + num_experts, total_tokens, in_features, out_features + ) + + # Create layers + manual_layer = PyTorchManualGroupedLinear( + num_experts, in_features, out_features, self.dtype + ).to(self.device) + # cutlass_layer = CUTLASSGroupedLinear(num_experts, in_features, out_features, strategy, dtype=self.dtype).to(self.device) + + def manual_backward(): + input_tokens_clone = input_tokens.clone().detach().requires_grad_(True) + manual_layer.zero_grad() + output = manual_layer(input_tokens_clone, expert_assignments) + loss = output.sum() + loss.backward() + return loss + + def cutlass_backward(): + input_tokens_clone = 
input_tokens.clone().detach().requires_grad_(True) + manual_layer.zero_grad() # Replace with cutlass_layer when available + output = manual_layer(input_tokens_clone, expert_assignments) + loss = output.sum() + loss.backward() + return loss + + # Benchmark using Triton if available + if TRITON_AVAILABLE: + manual_time = do_bench(manual_backward, warmup=warmup, rep=reps) + cutlass_time = do_bench(cutlass_backward, warmup=warmup, rep=reps) + else: + manual_time = self._basic_benchmark(manual_backward, warmup, reps) + cutlass_time = self._basic_benchmark(cutlass_backward, warmup, reps) + + return { + "manual_time": manual_time, + "cutlass_time": cutlass_time, + "speedup": manual_time / cutlass_time if cutlass_time > 0 else float("inf"), + } + + def _basic_benchmark(self, func, warmup, reps): + """Basic timing fallback when Triton is not available.""" + # Warmup + for _ in range(warmup): + func() + torch.cuda.synchronize() + + # Timing + start_time = time.time() + for _ in range(reps): + func() + torch.cuda.synchronize() + end_time = time.time() + + return (end_time - start_time) / reps * 1000 # Convert to ms + + def run_comprehensive_benchmark(self): + """Run comprehensive benchmarks across different problem sizes.""" + print("🚀 Running Comprehensive Group GEMM Benchmarks") + print("=" * 60) + + # Test configurations + configs = [ + # Small problems + { + "num_experts": 4, + "total_tokens": 256, + "in_features": 512, + "out_features": 1024, + "name": "Small", + }, + # Medium problems + { + "num_experts": 8, + "total_tokens": 512, + "in_features": 1024, + "out_features": 2048, + "name": "Medium", + }, + # Large problems + { + "num_experts": 8, + "total_tokens": 1024, + "in_features": 2048, + "out_features": 4096, + "name": "Large", + }, + # MoE-like problems + { + "num_experts": 8, + "total_tokens": 2048, + "in_features": 4096, + "out_features": 11008, + "name": "MoE-7B", + }, + { + "num_experts": 64, + "total_tokens": 4096, + "in_features": 4096, + "out_features": 11008, + "name": "MoE-Large", + }, + ] + + results = [] + + for config in configs: + print(f"\n📊 Benchmarking {config['name']} Configuration") + print( + f" {config['num_experts']} experts, {config['total_tokens']} tokens, {config['in_features']}→{config['out_features']}" + ) + + try: + # Test numerical correctness first + correct = self.test_numerical_correctness( + **{k: v for k, v in config.items() if k != "name"} + ) + + # Benchmark forward pass + print(" Benchmarking forward pass...") + forward_results = self.benchmark_forward_pass(config) + + # Benchmark backward pass + print(" Benchmarking backward pass...") + backward_results = self.benchmark_backward_pass(config) + + # Store results + result = { + "config": config, + "correct": correct, + "forward": forward_results, + "backward": backward_results, + } + results.append(result) + + # Print summary + print(f" ✓ Numerical correctness: {correct}") + print( + f" ✓ Forward: Manual={forward_results['manual_time']:.2f}ms, CUTLASS={forward_results['cutlass_time']:.2f}ms, Speedup={forward_results['speedup']:.2f}x" + ) + print( + f" ✓ Backward: Manual={backward_results['manual_time']:.2f}ms, CUTLASS={backward_results['cutlass_time']:.2f}ms, Speedup={backward_results['speedup']:.2f}x" + ) + + except Exception as e: + print(f" ❌ Error: {e}") + continue + + # Print final summary + self.print_benchmark_summary(results) + + return results + + def print_benchmark_summary(self, results): + """Print a formatted summary of benchmark results.""" + print("\n" + "=" * 80) + print("📈 BENCHMARK 
SUMMARY") + print("=" * 80) + + header = f"{'Config':<12} {'Correct':<8} {'Fwd Manual':<10} {'Fwd CUTLASS':<12} {'Fwd Speedup':<11} {'Bwd Manual':<10} {'Bwd CUTLASS':<12} {'Bwd Speedup':<11}" + print(header) + print("-" * len(header)) + + for result in results: + config = result["config"] + correct = "✓" if result["correct"] else "❌" + + fwd = result["forward"] + bwd = result["backward"] + + print( + f"{config['name']:<12} {correct:<8} {fwd['manual_time']:<10.2f} {fwd['cutlass_time']:<12.2f} {fwd['speedup']:<11.2f} {bwd['manual_time']:<10.2f} {bwd['cutlass_time']:<12.2f} {bwd['speedup']:<11.2f}" + ) + + # Calculate average speedups + if results: + avg_fwd_speedup = np.mean( + [ + r["forward"]["speedup"] + for r in results + if r["forward"]["speedup"] != float("inf") + ] + ) + avg_bwd_speedup = np.mean( + [ + r["backward"]["speedup"] + for r in results + if r["backward"]["speedup"] != float("inf") + ] + ) + + print(f"\n🎯 Average Speedups:") + print(f" Forward: {avg_fwd_speedup:.2f}x") + print(f" Backward: {avg_bwd_speedup:.2f}x") + + +def main(): + """Main test driver entry point.""" + parser = argparse.ArgumentParser(description="CUTLASS Group GEMM Test Driver") + parser.add_argument("--device", default="cuda", help="Device to run on") + parser.add_argument( + "--dtype", + default="bfloat16", + choices=["float16", "bfloat16", "float32"], + help="Data type", + ) + parser.add_argument("--num-experts", type=int, default=8, help="Number of experts") + parser.add_argument("--total-tokens", type=int, default=1024, help="Total tokens") + parser.add_argument("--in-features", type=int, default=2048, help="Input features") + parser.add_argument( + "--out-features", type=int, default=4096, help="Output features" + ) + parser.add_argument( + "--test-correctness", + action="store_true", + help="Test numerical correctness only", + ) + parser.add_argument( + "--benchmark-only", action="store_true", help="Run benchmarks only" + ) + + args = parser.parse_args() + + # Setup + dtype_map = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "float32": torch.float32, + } + dtype = dtype_map[args.dtype] + + # Initialize test driver + test_driver = GroupGemmTestDriver(device=args.device, dtype=dtype) + + print("🧪 CUTLASS Group GEMM Test Driver") + print( + f"Device: {args.device}, Dtype: {args.dtype}, Triton: {'✓' if TRITON_AVAILABLE else '❌'}" + ) + + if args.test_correctness: + # Test numerical correctness only + correct = test_driver.test_numerical_correctness( + args.num_experts, args.total_tokens, args.in_features, args.out_features + ) + print(f"\n🎯 Overall correctness: {'✓ PASS' if correct else '❌ FAIL'}") + + elif args.benchmark_only: + # Run single benchmark + config = { + "num_experts": args.num_experts, + "total_tokens": args.total_tokens, + "in_features": args.in_features, + "out_features": args.out_features, + "name": "Custom", + } + + forward_results = test_driver.benchmark_forward_pass(config) + backward_results = test_driver.benchmark_backward_pass(config) + + print(f"\n📊 Single Benchmark Results:") + print( + f"Forward: Manual={forward_results['manual_time']:.2f}ms, CUTLASS={forward_results['cutlass_time']:.2f}ms, Speedup={forward_results['speedup']:.2f}x" + ) + print( + f"Backward: Manual={backward_results['manual_time']:.2f}ms, CUTLASS={backward_results['cutlass_time']:.2f}ms, Speedup={backward_results['speedup']:.2f}x" + ) + + else: + # Run comprehensive benchmarks + results = test_driver.run_comprehensive_benchmark() + + +if __name__ == "__main__": + main() From 
a4b35c3c0a353a9cba6f70ca860615e56ef50a3c Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Thu, 12 Jun 2025 20:52:15 -0700 Subject: [PATCH 14/34] backwards, add initial numerics check (failing) --- .../deepseek_v3/cutlass_integration.py | 59 ++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/torchtitan/experiments/deepseek_v3/cutlass_integration.py b/torchtitan/experiments/deepseek_v3/cutlass_integration.py index 2de4b4cf7..fc2912193 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_integration.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_integration.py @@ -1,6 +1,63 @@ #!/usr/bin/env python3 """ -Integration example showing how to use the CUTLASS test driver with the actual strategy. +Integration testing + +current errors: +============================================================ +Initializing CUTLASSGroupedGemmStrategy for Blackwell +cute hardware - device_id 0 +cute hardware - driver_version 12080 +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 +Initialized CUTLASSGroupedGemmStrategy for Blackwell with: + - 2 CTA instructions: False + - MMA tiler (M, N): (128, 128) + - Cluster shape (M, N): (1, 1) + - Cluster size: 1 +🔍 Testing Forward Pass... +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 +Compiling CUTLASS grouped GEMM kernel: 8 groups, 2CTA=False, cluster=(1, 1) +Kernel compilation successful + Forward max difference: 0.00e+00 + Forward outputs close: ✓ +🔍 Testing Backward Pass... +Compiling CUTLASS grouped GEMM kernel: 8 groups, 2CTA=False, cluster=(1, 1) +Kernel compilation successful +Compiling CUTLASS grouped GEMM kernel: 8 groups, 2CTA=False, cluster=(1, 1) +Kernel compilation successful + Input grad max difference: 2.21e+02 + Input gradients close: ❌ + Weight grad max difference: 8.55e+01 + Weight gradients close: ❌ + +🎯 Overall Result: ❌ FAIL + +🚀 Benchmarking CUTLASS vs Manual Implementation +============================================================ + +📊 Medium: 8 experts, 1024 tokens +Initializing CUTLASSGroupedGemmStrategy for Blackwell +cute hardware - device_id 0 +cute hardware - driver_version 12080 +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 +Initialized CUTLASSGroupedGemmStrategy for Blackwell with: + - 2 CTA instructions: False + - MMA tiler (M, N): (128, 128) + - Cluster shape (M, N): (1, 1) + - Cluster size: 1 +Traceback (most recent call last): + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_integration.py", line 319, in + main() + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_integration.py", line 309, in main + benchmark_cutlass_vs_manual() + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_integration.py", line 211, in benchmark_cutlass_vs_manual + cutlass_layer = CUTLASSGroupedLinear( + ^^^^^^^^^^^^^^^^^^^^^ + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 1356, in __init__ + raise NotImplementedError( +NotImplementedError: Bias not yet implemented for CUTLASS grouped linear """ import os From 0c5b84ccad277d4ec443ca644e1b4c81532a5931 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Thu, 12 Jun 2025 23:16:01 -0700 Subject: [PATCH 15/34] progress on backwards, still failing --- .../deepseek_v3/cutlass_backwards.py | 664 +++--------------- .../deepseek_v3/cutlass_integration.py | 52 +- .../deepseek_v3/cutlass_test_driver.py | 1 + 3 files changed, 102 insertions(+), 615 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/cutlass_backwards.py 
b/torchtitan/experiments/deepseek_v3/cutlass_backwards.py index ca902c09a..6a84856e1 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_backwards.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_backwards.py @@ -23,96 +23,6 @@ print(f"✗ Import failed: {e}") print("Using PyTorch fallback implementations only") -""" -Notes! - -current error - requires a kernel context before getting hardware info. - -This class is used to get the hardware info of given GPU device. -It provides methods to get the max active clusters for given cluster size. - -Prerequisite: -- CUDA driver is initialized via `driver.cuInit` or other CUDA APIs. -- CUDA context is created via `driver.cuCtxCreate` or other CUDA APIs. - - -this works: -cute hardware - device_id 0 -cute hardware - driver_version 12080 -2025-06-12 15:59:27,116 - INFO - Started preprocessing [_host_function] -2025-06-12 15:59:27,117 - INFO - ASTPreprocessor Transforming function [_host_function] -2025-06-12 15:59:27,118 - INFO - ASTPreprocessor Executing transformed code for function [_host_function] -2025-06-12 15:59:27,118 - INFO - Final mangled function name: cutlass__host_function_cutlassutilshardware_infoHardwareInfo_object_at_ -2025-06-12 15:59:27,120 - INFO - Started preprocessing [_empty_kernel] -2025-06-12 15:59:27,120 - INFO - ASTPreprocessor Transforming function [_empty_kernel] -2025-06-12 15:59:27,121 - INFO - ASTPreprocessor Executing transformed code for function [_empty_kernel] -2025-06-12 15:59:27,122 - INFO - Final mangled function name: cutlass__empty_kernel_cutlassutilshardware_infoHardwareInfo_object_at_ -2025-06-12 15:59:27,243 - INFO - Function=[cutlass__host_function_cutlassutilshardware_infoHardwareInfo_object_at_] Computed module_hash=[dcde861f00d587038a517012d015a7fe7d920bf8fd510b4d947c612997627913] -2025-06-12 15:59:27,243 - INFO - JIT cache miss function=[cutlass__host_function_cutlassutilshardware_infoHardwareInfo_object_at_] module_hash=[dcde861f00d587038a517012d015a7fe7d920bf8fd510b4d947c612997627913] -2025-06-12 15:59:27,275 - INFO - cuModuleLoadData 478334896 -2025-06-12 15:59:27,276 - INFO - cuModuleGetFunction kernel_cutlass__empty_kernel_cutlassutilshardware_infoHardwareInfo_object_at__0 -2025-06-12 15:59:27,276 - INFO - <-- cuModuleGetFunction -max_dynamic_shared_memory: 232448 -max_active_blocks: 1 -Initialized CUTLASSGroupedGemmStrategy for Blackwell with: - -but basic strategy below fails: - -✓ CUTLASS and strategies imported successfully -Testing CUTLASS Backward Group GEMM... 
-Creating strategy for 4 experts, 1024 in_features, 2048 out_features, 512 total_tokens -Initializing CUTLASSGroupedGemmStrategy for Blackwell -cute hardware - device_id 0 -cute hardware - driver_version 12080 -Traceback (most recent call last): - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 1387, in - test_cutlass_backward_group_gemm() - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 1348, in test_cutlass_backward_group_gemm - strategy = CUTLASSGroupedGemmStrategy( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 120, in __init__ - self._initialize_hardware() - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 149, in _initialize_hardware - self.max_active_clusters = self.hardware_info.get_max_active_clusters( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/jit_executor.py", line 328, in walk_module_and_get_cubin_data - module.operation.walk(walk_gpu_binary_op) -RuntimeError: Exception raised in callback: Traceback (most recent call last): - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 1387, in - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 1348, in test_cutlass_backward_group_gemm - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 120, in __init__ - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 149, in _initialize_hardware - File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/utils/hardware_info.py", line 47, in get_max_active_clusters - File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/utils/hardware_info.py", line 176, in _get_device_function - File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/compiler.py", line 221, in compile - File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 1337, in _func - File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 1188, in generate_mlir - File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 1129, in compile_and_cache - File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/jit_executor.py", line 258, in update_jit_cuda_modules - File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/jit_executor.py", line 328, in walk_module_and_get_cubin_data - File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/jit_executor.py", line 325, in walk_gpu_binary_op - File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/jit_executor.py", line 243, in walk_callback - File 
"/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/runtime/cuda.py", line 294, in load_cubin_module_data - File "/home/less/.conda/envs/pycutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/runtime/cuda.py", line 229, in checkCudaErrors -DSLCudaRuntimeError: DSLCudaRuntimeError: Unknown CUDA error -Error Code: 201 - -🔍 Additional Context: -- Error name: CUDA_ERROR_INVALID_CONTEXT -- CUDA_TOOLKIT_PATH: not set -- Target SM ARCH: not set - -📊 GPU Information: -- CUDA devices available: 8 (current: 0) -- Architecture: Blackwell (sm_100a) -- Compatible SM archs: sm_100a - -Compatibility Check: -❌ Error: Target SM ARCH unknown is not compatible -💡 Please use one of SM ARCHs: sm_100a - -""" - # Strategy base class for GroupGEMM implementations class GroupGEMMStrategy: @@ -189,9 +99,9 @@ class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): def __init__( self, custom_activation=nn.SiLU(), - use_2cta_instrs=True, - mma_tiler_mn=(256, 128), - cluster_shape_mn=(4, 4), + use_2cta_instrs=True, # Changed default to False to avoid context issues + mma_tiler_mn=(256, 128), # Changed default to single-CTA values + cluster_shape_mn=(2, 2), # Changed default to single-CTA values ): """Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture.""" print(f"Initializing CUTLASSGroupedGemmStrategy for Blackwell") @@ -202,9 +112,6 @@ def __init__( self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() - # Validate configurations - # self._validate_configurations() - # Initialize kernel and hardware info self._initialize_kernel() self._initialize_hardware() @@ -235,9 +142,10 @@ def _initialize_kernel(self): def _initialize_hardware(self): """Initialize hardware information and stream.""" - # TODO - this is a workaround for dsl cuda context requirement + # Force CUDA context creation to avoid DSL errors dummy_tensor = torch.zeros(1, device="cuda") dummy_tensor.cpu() + self.hardware_info = utils.HardwareInfo() self.max_active_clusters = self.hardware_info.get_max_active_clusters( self.cluster_shape_mn[0] * self.cluster_shape_mn[1] @@ -246,119 +154,30 @@ def _initialize_hardware(self): torch_stream = torch.cuda.current_stream() self.stream = cuda.CUstream(torch_stream.cuda_stream) - def _validate_configurations(self): - """Validate configurations for Blackwell.""" - self._validate_mma_tiler() - self._validate_cluster_shape() - self._validate_2cta_constraints() - - def _validate_mma_tiler(self): - """Validate MMA tiler configuration.""" - m_size, n_size = self.mma_tiler_mn - - valid_m_sizes = ( - self.DUAL_CTA_M_SIZES if self.use_2cta_instrs else self.SINGLE_CTA_M_SIZES - ) - mode_name = "2 CTA" if self.use_2cta_instrs else "single CTA" - - if m_size not in valid_m_sizes: - raise ValueError( - f"For {mode_name} mode on Blackwell, MMA tiler M must be in {valid_m_sizes}, got {m_size}" - ) - - if n_size not in self.N_SIZE_RANGE: - raise ValueError( - f"MMA tiler N must be in range [32, 256] with step 32, got {n_size}" - ) - - def _validate_cluster_shape(self): - """Validate cluster shape configuration.""" - if self.cluster_shape_mn not in self.SUPPORTED_CLUSTER_SHAPES: - raise ValueError( - f"Cluster shape {self.cluster_shape_mn} not supported on Blackwell. 
" - f"Valid cluster shapes are: {self.SUPPORTED_CLUSTER_SHAPES}" - ) - - def _validate_2cta_constraints(self): - """Validate 2 CTA specific constraints.""" - if self.use_2cta_instrs and self.cluster_shape_mn[0] % 2 != 0: - valid_2cta_shapes = [ - shape for shape in self.SUPPORTED_CLUSTER_SHAPES if shape[0] % 2 == 0 - ] - raise ValueError( - f"For 2 CTA mode, cluster shape M must be even, got {self.cluster_shape_mn[0]}. " - f"Valid 2 CTA cluster shapes: {valid_2cta_shapes}" - ) - def _log_initialization(self): """Log initialization information.""" cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + + print(f"max_active_blocks: {self.max_active_clusters}") print(f"Initialized CUTLASSGroupedGemmStrategy for Blackwell with:") print(f" - 2 CTA instructions: {self.use_2cta_instrs}") print(f" - MMA tiler (M, N): {self.mma_tiler_mn}") print(f" - Cluster shape (M, N): {self.cluster_shape_mn}") print(f" - Cluster size: {cluster_size}") - if cluster_size > 1: - print(f" - Using multi-CTA parallelism") def arrange_expert_weights(self, all_weights, submod_name, module): """Store weights in stacked format.""" return torch.stack(all_weights) def execute(self, contig_tokens, m_sizes, m_offsets, module): - """ - Execute using CUTLASS grouped GEMM kernel - GPU-only version. - - Args: - contig_tokens: Input tokens arranged contiguously by expert - m_sizes: Tensor of expert sizes (GPU tensor to avoid sync) - m_offsets: Tensor of expert offsets (GPU tensor to avoid sync) - module: MoE module containing weights - """ - # Convert to GPU tensors if needed (avoid CPU-GPU sync) - m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( - m_sizes, m_offsets, contig_tokens.device - ) - - # Validate inputs - # self._validate_inputs(contig_tokens, m_sizes_gpu, module) - - # Get weights and device - weights = self._get_weights(module) - device = contig_tokens.device - - # Prepare output tensor - output = torch.zeros( - contig_tokens.shape[0], - weights["gate"].shape[2], - dtype=self.DTYPE_TORCH, - device=device, - ) - - # Check for valid experts using GPU operations (no sync) - if not self._has_valid_experts_gpu(m_sizes_gpu): - return output - - # Execute the three-stage computation using GPU-only operations - gate_outputs, up_outputs = self._execute_projections_gpu( - contig_tokens, - weights["gate"], - weights["up"], - m_sizes_gpu, - m_offsets_gpu, - device, - ) - - hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) - - final_outputs = self._execute_down_projection_gpu( - hidden_states, weights["down"], m_sizes_gpu, device - ) - - return self._reconstruct_output_gpu( - final_outputs, m_sizes_gpu, m_offsets_gpu, output + """Execute using CUTLASS grouped GEMM kernel - GPU-only version.""" + # This method is used by the MoE strategy, not by the linear layer + # For the linear layer, we use CUTLASSBackwardGroupGemm.apply() directly + raise NotImplementedError( + "This method is for MoE integration, use CUTLASSGroupedLinear instead" ) + # All the helper methods for CUTLASS kernel execution def _ensure_gpu_tensors(self, m_sizes, m_offsets, device): """Ensure sizes and offsets are GPU tensors to avoid CPU-GPU sync.""" if not isinstance(m_sizes, torch.Tensor): @@ -373,270 +192,6 @@ def _ensure_gpu_tensors(self, m_sizes, m_offsets, device): return m_sizes_gpu, m_offsets_gpu - def _has_valid_experts_gpu(self, m_sizes_gpu): - """Check if any experts have tokens using GPU operations (no sync).""" - return torch.any( - m_sizes_gpu > 0 - ).item() # Single sync here is unavoidable for control 
flow - - def _validate_inputs(self, contig_tokens, m_sizes_gpu, module): - """Validate input parameters.""" - if contig_tokens.dtype != self.DTYPE_TORCH: - raise ValueError( - f"Expected input dtype {self.DTYPE_TORCH}, got {contig_tokens.dtype}" - ) - - if len(contig_tokens.shape) != 2: - raise ValueError( - f"Expected 2D input tensor, got shape {contig_tokens.shape}" - ) - - required_params = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] - for param in required_params: - if not hasattr(module, param) or module.get_parameter(param) is None: - raise ValueError(f"Module missing required parameter: {param}") - - def _get_weights(self, module): - """Extract and return weight tensors from module.""" - return { - "gate": module.get_parameter("gate_proj_weight"), - "up": module.get_parameter("up_proj_weight"), - "down": module.get_parameter("down_proj_weight"), - } - - def _execute_projections_gpu( - self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device - ): - """Execute gate and up projections using GPU-only operations.""" - # Find valid experts using GPU operations - valid_mask = m_sizes_gpu > 0 - valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] - - if len(valid_indices) == 0: - return [], [] - - # Prepare metadata in batch using GPU operations - problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs = ( - self._prepare_gate_up_metadata_gpu( - input_tokens, - weight1, - weight2, - m_sizes_gpu, - m_offsets_gpu, - valid_indices, - device, - ) - ) - - if len(problem_sizes) == 0: - return [], [] - - # Execute grouped GEMM - self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) - - return gate_outputs, up_outputs - - def _prepare_gate_up_metadata_gpu( - self, - input_tokens, - gate_weights, - up_weights, - m_sizes_gpu, - m_offsets_gpu, - valid_indices, - device, - ): - """Prepare metadata for gate and up projections""" - problem_sizes = [] - strides_abc = [] - ptrs_abc = [] - gate_outputs = [] - up_outputs = [] - - # Extract valid sizes and offsets (minimal sync - only for valid experts) - valid_sizes = m_sizes_gpu[valid_indices] - valid_offsets = ( - m_offsets_gpu[valid_indices] - if len(m_offsets_gpu) > len(valid_indices) - else torch.cumsum( - torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 - ) - ) - - # Convert to Python for iteration (unavoidable in this test for metadata preparation) - valid_sizes_cpu = valid_sizes.cpu().tolist() - valid_offsets_cpu = valid_offsets.cpu().tolist() - valid_indices_cpu = valid_indices.cpu().tolist() - - for i, (expert_idx, size, offset) in enumerate( - zip(valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu) - ): - if size > 0: - # Get expert data - expert_tokens = input_tokens[offset : offset + size].contiguous() - gate_weight = gate_weights[expert_idx].contiguous() - up_weight = up_weights[expert_idx].contiguous() - - M, K = expert_tokens.shape - N = gate_weight.shape[0] - L = 1 - - # Create output tensors - gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) - up_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) - - # Add both projections to metadata - for weight, output, output_list in [ - (gate_weight, gate_output, gate_outputs), - (up_weight, up_output, up_outputs), - ]: - self._add_projection_to_metadata( - expert_tokens, - weight, - output, - problem_sizes, - strides_abc, - ptrs_abc, - ) - output_list.append(output) - - return problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs - - def _execute_down_projection_gpu( - 
self, hidden_states, down_weights, m_sizes_gpu, device - ): - """Execute down projection using GPU operations.""" - if not hidden_states: - return [] - - # Find valid experts - valid_mask = m_sizes_gpu > 0 - valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] - - # Prepare metadata - problem_sizes, strides_abc, ptrs_abc, down_outputs = ( - self._prepare_down_metadata_gpu( - hidden_states, down_weights, valid_indices, device - ) - ) - - if len(problem_sizes) == 0: - return [] - - # Execute grouped GEMM - self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) - - return down_outputs - - def _prepare_down_metadata_gpu( - self, hidden_states, down_weights, valid_indices, device - ): - """Prepare metadata for down projection using GPU operations.""" - problem_sizes = [] - strides_abc = [] - ptrs_abc = [] - down_outputs = [] - - # Convert indices to CPU for iteration (minimal sync) - valid_indices_cpu = valid_indices.cpu().tolist() - - for i, expert_idx in enumerate(valid_indices_cpu): - if i < len(hidden_states): - hidden = hidden_states[i] - down_weight = down_weights[expert_idx].contiguous() - - M, K = hidden.shape - N = down_weight.shape[0] - - # Create output tensor - down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) - down_outputs.append(down_output) - - # Add to metadata - self._add_projection_to_metadata( - hidden, - down_weight, - down_output, - problem_sizes, - strides_abc, - ptrs_abc, - ) - - return problem_sizes, strides_abc, ptrs_abc, down_outputs - - def _add_projection_to_metadata( - self, - input_tensor, - weight_tensor, - output_tensor, - problem_sizes, - strides_abc, - ptrs_abc, - ): - """Add a single projection to the metadata lists.""" - M, K = input_tensor.shape - N = weight_tensor.shape[0] - L = 1 - - # Convert to MNKL format - input_mnkl = input_tensor.unsqueeze(-1).contiguous() - weight_mnkl = weight_tensor.unsqueeze(-1).contiguous() - output_mnkl = output_tensor.unsqueeze(-1).contiguous() - - # Extract strides - input_strides = list(input_mnkl.stride()[:2]) - weight_strides = list(weight_mnkl.stride()[:2]) - output_strides = list(output_mnkl.stride()[:2]) - - # Add to metadata - problem_sizes.append([M, N, K, L]) - strides_abc.append([input_strides, weight_strides, output_strides]) - ptrs_abc.append( - [ - input_tensor.data_ptr(), - weight_tensor.data_ptr(), - output_tensor.data_ptr(), - ] - ) - - def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): - """Execute the grouped GEMM kernel.""" - num_groups = len(problem_sizes) - - # Convert to CUTE tensors - problem_sizes_cute, strides_cute, ptrs_cute = self._convert_to_cute_tensors( - problem_sizes, strides_abc, ptrs_abc, device - ) - - # Get tensormap and compute clusters - tensormap_cute = self._get_tensormap_buffer(device) - total_clusters = self._compute_total_clusters(problem_sizes) - - # Get initial tensors for compilation - initial_tensors = self._create_initial_tensors(problem_sizes[0], device) - - # Compile or retrieve kernel - compiled_kernel = self._get_compiled_kernel( - num_groups, - total_clusters, - initial_tensors, - problem_sizes_cute, - strides_cute, - ptrs_cute, - tensormap_cute, - ) - - # Execute kernel - compiled_kernel( - *initial_tensors, - problem_sizes_cute, - strides_cute, - ptrs_cute, - tensormap_cute, - self.stream, - ) - torch.cuda.synchronize() - def _convert_to_cute_tensors(self, problem_sizes, strides_abc, ptrs_abc, device): """Convert metadata to CUTE tensors.""" problem_sizes_tensor = torch.tensor( @@ -748,46 +303,6 @@ 
def _compute_total_clusters(self, problem_sizes): return total - def _apply_activation_and_combine(self, gate_outputs, up_outputs): - """Apply activation and combine gate/up outputs.""" - return [ - self.activation_function(gate_out) * up_out - for gate_out, up_out in zip(gate_outputs, up_outputs) - ] - - def _reconstruct_output_gpu( - self, final_outputs, m_sizes_gpu, m_offsets_gpu, output - ): - """Reconstruct the full output tensor using GPU operations (minimal sync).""" - if not final_outputs: - return output - - # Find valid experts - valid_mask = m_sizes_gpu > 0 - valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] - valid_sizes = m_sizes_gpu[valid_indices] - - # Compute offsets if not provided properly - if len(m_offsets_gpu) <= len(valid_indices): - valid_offsets = torch.cumsum( - torch.cat( - [torch.tensor([0], device=m_sizes_gpu.device), valid_sizes[:-1]] - ), - dim=0, - ) - else: - valid_offsets = m_offsets_gpu[valid_indices] - - # Convert to CPU for final reconstruction (minimal sync) - valid_sizes_cpu = valid_sizes.cpu().tolist() - valid_offsets_cpu = valid_offsets.cpu().tolist() - - for i, (size, offset) in enumerate(zip(valid_sizes_cpu, valid_offsets_cpu)): - if i < len(final_outputs): - output[offset : offset + size] = final_outputs[i] - - return output - @staticmethod def is_available() -> bool: return HAS_CUTLASS @@ -1048,25 +563,27 @@ def _prepare_forward_metadata( ].contiguous() # [M, K] expert_weight = weight_stack[ expert_idx - ].contiguous() # [N, K] - already transposed + ].contiguous() # [N, K] - already transposed for forward M, K = expert_tokens.shape N, K_w = expert_weight.shape assert K == K_w, f"Dimension mismatch: {K} != {K_w}" - L = 1 # Create output tensor output = torch.empty(M, N, dtype=strategy.DTYPE_TORCH, device=device) outputs.append(output) # Add to metadata: expert_tokens @ expert_weight^T + # For CUTLASS, we need to pass B transposed, so we pass expert_weight as [N,K] + # and CUTLASS will compute A @ B^T CUTLASSBackwardGroupGemm._add_gemm_to_metadata( - expert_tokens, - expert_weight, - output, + expert_tokens, # A: [M, K] + expert_weight, # B: [N, K] (will compute A @ B^T) + output, # C: [M, N] problem_sizes, strides_abc, ptrs_abc, + transpose_b=True, # Indicate B should be transposed ) return problem_sizes, strides_abc, ptrs_abc, outputs @@ -1112,7 +629,6 @@ def _prepare_input_grad_metadata( M, N = grad_expert.shape N_w, K = expert_weight.shape assert N == N_w, f"Dimension mismatch: {N} != {N_w}" - L = 1 # Create output tensor for gradient grad_input_expert = torch.empty( @@ -1121,13 +637,15 @@ def _prepare_input_grad_metadata( outputs.append(grad_input_expert) # Add to metadata: grad_expert @ expert_weight (no transpose needed) + # grad_expert: [M, N], expert_weight: [N, K] -> result: [M, K] CUTLASSBackwardGroupGemm._add_gemm_to_metadata( - grad_expert, - expert_weight, - grad_input_expert, + grad_expert, # A: [M, N] + expert_weight, # B: [N, K] + grad_input_expert, # C: [M, K] problem_sizes, strides_abc, ptrs_abc, + transpose_b=False, # No transpose needed ) return problem_sizes, strides_abc, ptrs_abc, outputs @@ -1177,50 +695,75 @@ def _prepare_weight_grad_metadata( M, N = grad_expert.shape M_i, K = input_expert.shape assert M == M_i, f"Dimension mismatch: {M} != {M_i}" - L = 1 # Get output tensor (slice of grad_weight for this expert) grad_weight_expert = grad_weight[expert_idx] # [N, K] - # For dW = dY^T @ X, we need to transpose dY - # This means we compute: X^T @ dY -> (dY^T @ X)^T = dW^T, then transpose result - # Actually, let's 
compute grad_expert^T @ input_expert directly - grad_expert_t = grad_expert.t().contiguous() # [N, M] - - # Add to metadata: grad_expert_t @ input_expert -> grad_weight_expert + # For dW = dY^T @ X, we compute: grad_expert^T @ input_expert -> grad_weight_expert + # grad_expert^T: [N, M], input_expert: [M, K] -> result: [N, K] CUTLASSBackwardGroupGemm._add_gemm_to_metadata( - grad_expert_t, - input_expert, - grad_weight_expert, + grad_expert, # A: [M, N] (will be transposed) + input_expert, # B: [M, K] + grad_weight_expert, # C: [N, K] problem_sizes, strides_abc, ptrs_abc, + transpose_a=True, # Transpose A to get [N, M] ) return problem_sizes, strides_abc, ptrs_abc @staticmethod - def _add_gemm_to_metadata(A, B, C, problem_sizes, strides_abc, ptrs_abc): + def _add_gemm_to_metadata( + A, + B, + C, + problem_sizes, + strides_abc, + ptrs_abc, + transpose_a=False, + transpose_b=False, + ): """Add a single GEMM operation to metadata lists.""" - M, K = A.shape - # Check if B is [N, K] or [K, N] and handle accordingly - if B.shape[1] == K: # B is [N, K] - N, K_b = B.shape - else: # B is [K, N] - K_b, N = B.shape - # Transpose B for the computation - B = B.t().contiguous() - - assert K == K_b, f"Inner dimension mismatch: {K} != {K_b}" + # Get original shapes + if transpose_a: + M, K_A = A.shape[1], A.shape[0] # A is [K_A, M] but we want [M, K_A] + else: + M, K_A = A.shape - B_transposed = B # .t().contiguous() # [N, K] + if transpose_b: + N, K_B = ( + B.shape[0], + B.shape[1], + ) # B is [N, K_B] but we want [K_B, N] for B^T + else: + K_B, N = B.shape + # Ensure inner dimensions match + assert K_A == K_B, f"Inner dimension mismatch: {K_A} != {K_B}" + K = K_A L = 1 - # Convert to MNKL format - A_mnkl = A.unsqueeze(-1).contiguous() # [M, K, 1] - B_mnkl = B_transposed.unsqueeze(-1).contiguous() # [N, K, 1] - C_mnkl = C.unsqueeze(-1).contiguous() # [M, N, 1] + # Create proper tensor views for CUTLASS + if transpose_a: + # A^T: need to transpose A + A_for_gemm = A.t().contiguous() # [M, K] + else: + A_for_gemm = A.contiguous() # [M, K] + + if transpose_b: + # B^T: B is already [N, K], CUTLASS will handle the transpose + B_for_gemm = B.contiguous() # [N, K] + else: + # B: need to transpose to [N, K] format expected by CUTLASS + B_for_gemm = B.t().contiguous() # [N, K] + + C_for_gemm = C.contiguous() # [M, N] + + # Convert to MNKL format for CUTLASS + A_mnkl = A_for_gemm.unsqueeze(-1).contiguous() # [M, K, 1] + B_mnkl = B_for_gemm.unsqueeze(-1).contiguous() # [N, K, 1] + C_mnkl = C_for_gemm.unsqueeze(-1).contiguous() # [M, N, 1] # Extract strides A_strides = list(A_mnkl.stride()[:2]) @@ -1230,7 +773,9 @@ def _add_gemm_to_metadata(A, B, C, problem_sizes, strides_abc, ptrs_abc): # Add to metadata problem_sizes.append([M, N, K, L]) strides_abc.append([A_strides, B_strides, C_strides]) - ptrs_abc.append([A.data_ptr(), B_transposed.data_ptr(), C.data_ptr()]) + ptrs_abc.append( + [A_for_gemm.data_ptr(), B_for_gemm.data_ptr(), C_for_gemm.data_ptr()] + ) @staticmethod def _execute_cutlass_kernel(problem_sizes, strides_abc, ptrs_abc, device, strategy): @@ -1352,11 +897,6 @@ def __init__( """ super().__init__() - if bias: - raise NotImplementedError( - "Bias not yet implemented for CUTLASS grouped linear" - ) - self.num_experts = num_experts self.in_features = in_features self.out_features = out_features @@ -1437,41 +977,6 @@ def extra_repr(self) -> str: return f"num_experts={self.num_experts}, in_features={self.in_features}, out_features={self.out_features}" -def _initialize_hardware_test(self): - """Initialize 
hardware information and stream.""" - # Force CUDA context creation by performing a simple operation - # This ensures the context exists before HardwareInfo queries it - dummy_tensor = torch.zeros(1, device="cuda") - dummy_tensor.cpu() # Force synchronization to establish context - - # Now it's safe to create HardwareInfo - hardware_info = utils.HardwareInfo() - max_active_clusters = self.hardware_info.get_max_active_clusters( - self.cluster_shape_mn[0] * self.cluster_shape_mn[1] - ) - - torch_stream = torch.cuda.current_stream() - self.stream = cuda.CUstream(torch_stream.cuda_stream) - - -def _debug_cuda_context(self): - """Debug CUDA context state.""" - try: - import cuda.bindings.driver as cuda_driver - - try: - current_context = cuda_driver.cuCtxGetCurrent() - print(f"CUDA context exists: {current_context is not None}") - except: - print("No CUDA context found") - - print(f"PyTorch CUDA available: {torch.cuda.is_available()}") - print(f"PyTorch CUDA initialized: {torch.cuda.is_initialized()}") - - except Exception as e: - print(f"Debug failed: {e}") - - # Example usage and testing functions def test_cutlass_backward_group_gemm(): """Test the CUTLASS backward group GEMM implementation.""" @@ -1489,15 +994,16 @@ def test_cutlass_backward_group_gemm(): print( f"Creating strategy for {num_experts} experts, {in_features} in_features, {out_features} out_features, {total_tokens} total_tokens" ) - # Create strategy (assuming it's available) + # Create strategy with safe defaults that avoid context issues strategy = CUTLASSGroupedGemmStrategy( custom_activation=lambda x: x, # Identity for testing - use_2cta_instrs=True, - mma_tiler_mn=(256, 128), - cluster_shape_mn=(2, 2), + use_2cta_instrs=True, # Use single CTA to avoid context issues + mma_tiler_mn=(256, 128), # Safe single-CTA values + cluster_shape_mn=(2, 2), # Safe single-CTA values ) print(f"Using strategy: {strategy}") + # Create test data input_tokens = torch.randn( total_tokens, in_features, dtype=dtype, device=device, requires_grad=True @@ -1510,6 +1016,7 @@ def test_cutlass_backward_group_gemm(): in_features=in_features, out_features=out_features, strategy=strategy, + bias=False, dtype=dtype, ) @@ -1527,6 +1034,7 @@ def test_cutlass_backward_group_gemm(): assert layer.weight.grad is not None, "Weight gradient should not be None" print("✓ CUTLASS Backward Group GEMM test passed!") + return True if __name__ == "__main__": diff --git a/torchtitan/experiments/deepseek_v3/cutlass_integration.py b/torchtitan/experiments/deepseek_v3/cutlass_integration.py index fc2912193..896ca0569 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_integration.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_integration.py @@ -9,61 +9,37 @@ cute hardware - driver_version 12080 max_dynamic_shared_memory: 232448 max_active_blocks: 1 +max_active_blocks: 33 Initialized CUTLASSGroupedGemmStrategy for Blackwell with: - - 2 CTA instructions: False - - MMA tiler (M, N): (128, 128) - - Cluster shape (M, N): (1, 1) - - Cluster size: 1 + - 2 CTA instructions: True + - MMA tiler (M, N): (256, 128) + - Cluster shape (M, N): (2, 2) + - Cluster size: 4 🔍 Testing Forward Pass... max_dynamic_shared_memory: 232448 max_active_blocks: 1 -Compiling CUTLASS grouped GEMM kernel: 8 groups, 2CTA=False, cluster=(1, 1) +Compiling CUTLASS grouped GEMM kernel: 8 groups, 2CTA=True, cluster=(2, 2) Kernel compilation successful Forward max difference: 0.00e+00 Forward outputs close: ✓ 🔍 Testing Backward Pass... 
-Compiling CUTLASS grouped GEMM kernel: 8 groups, 2CTA=False, cluster=(1, 1) +Compiling CUTLASS grouped GEMM kernel: 8 groups, 2CTA=True, cluster=(2, 2) Kernel compilation successful -Compiling CUTLASS grouped GEMM kernel: 8 groups, 2CTA=False, cluster=(1, 1) +Compiling CUTLASS grouped GEMM kernel: 8 groups, 2CTA=True, cluster=(2, 2) Kernel compilation successful - Input grad max difference: 2.21e+02 + Input grad max difference: 2.34e+02 Input gradients close: ❌ - Weight grad max difference: 8.55e+01 + Weight grad max difference: 9.10e+01 Weight gradients close: ❌ -🎯 Overall Result: ❌ FAIL -🚀 Benchmarking CUTLASS vs Manual Implementation -============================================================ - -📊 Medium: 8 experts, 1024 tokens -Initializing CUTLASSGroupedGemmStrategy for Blackwell -cute hardware - device_id 0 -cute hardware - driver_version 12080 -max_dynamic_shared_memory: 232448 -max_active_blocks: 1 -Initialized CUTLASSGroupedGemmStrategy for Blackwell with: - - 2 CTA instructions: False - - MMA tiler (M, N): (128, 128) - - Cluster shape (M, N): (1, 1) - - Cluster size: 1 -Traceback (most recent call last): - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_integration.py", line 319, in - main() - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_integration.py", line 309, in main - benchmark_cutlass_vs_manual() - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_integration.py", line 211, in benchmark_cutlass_vs_manual - cutlass_layer = CUTLASSGroupedLinear( - ^^^^^^^^^^^^^^^^^^^^^ - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/cutlass_backwards.py", line 1356, in __init__ - raise NotImplementedError( -NotImplementedError: Bias not yet implemented for CUTLASS grouped linear """ import os import sys import torch +import torch.nn as nn try: @@ -90,14 +66,14 @@ def create_cutlass_strategy( - use_2cta_instrs=False, mma_tiler_mn=(128, 128), cluster_shape_mn=(1, 1) + use_2cta_instrs=True, mma_tiler_mn=(256, 128), cluster_shape_mn=(2, 2) ): """Create a CUTLASS strategy with specified configuration.""" if not CUTLASS_AVAILABLE: raise RuntimeError("CUTLASS not available") strategy = CUTLASSGroupedGemmStrategy( - custom_activation=lambda x: x, # Identity for linear layers + custom_activation=nn.SiLU(), # Identity for linear layers use_2cta_instrs=use_2cta_instrs, mma_tiler_mn=mma_tiler_mn, cluster_shape_mn=cluster_shape_mn, @@ -134,6 +110,7 @@ def test_cutlass_vs_manual(): in_features=in_features, out_features=out_features, strategy=strategy, + bias=False, dtype=dtype, ).to(device) else: @@ -144,6 +121,7 @@ def test_cutlass_vs_manual(): num_experts=num_experts, in_features=in_features, out_features=out_features, + bias=False, dtype=dtype, ).to(device) diff --git a/torchtitan/experiments/deepseek_v3/cutlass_test_driver.py b/torchtitan/experiments/deepseek_v3/cutlass_test_driver.py index fd2b97fda..18663a683 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_test_driver.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_test_driver.py @@ -119,6 +119,7 @@ def __init__( num_experts: int, in_features: int, out_features: int, + bias: bool = False, dtype: torch.dtype = torch.bfloat16, ): super().__init__() From 74257fb5d2f964eaae9721b7767a0f3ac2814ee5 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sat, 14 Jun 2025 10:21:16 -0700 Subject: [PATCH 16/34] standalone gg for backwards debugging --- .../deepseek_v3/cutlass_backwards.py | 1121 ++++++++++++++++- 
.../deepseek_v3/dsl_back_standalone.py | 669 ++++++++++ .../deepseek_v3/simple_debug_back.py | 332 +++++ 3 files changed, 2119 insertions(+), 3 deletions(-) create mode 100644 torchtitan/experiments/deepseek_v3/dsl_back_standalone.py create mode 100644 torchtitan/experiments/deepseek_v3/simple_debug_back.py diff --git a/torchtitan/experiments/deepseek_v3/cutlass_backwards.py b/torchtitan/experiments/deepseek_v3/cutlass_backwards.py index 6a84856e1..3b31dc3cb 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_backwards.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_backwards.py @@ -99,9 +99,9 @@ class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): def __init__( self, custom_activation=nn.SiLU(), - use_2cta_instrs=True, # Changed default to False to avoid context issues + use_2cta_instrs=False, # Changed default to False to avoid context issues mma_tiler_mn=(256, 128), # Changed default to single-CTA values - cluster_shape_mn=(2, 2), # Changed default to single-CTA values + cluster_shape_mn=(1, 1), # Changed default to single-CTA values ): """Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture.""" print(f"Initializing CUTLASSGroupedGemmStrategy for Blackwell") @@ -309,6 +309,601 @@ def is_available() -> bool: class CUTLASSBackwardGroupGemm(torch.autograd.Function): + """ + Performance-optimized CUTLASS grouped GEMM with backward pass support. + + Key optimizations: + 1. Eliminated unnecessary transpositions using CUTLASS layout system + 2. Fused input and weight gradient computations + 3. Reduced memory allocations and copies + 4. Optimized memory access patterns + 5. Minimized CPU-GPU synchronization + """ + + @staticmethod + def forward(ctx, input_tokens, weight_stack, m_sizes, m_offsets, strategy): + """Forward pass: Y_i = X_i @ W_i^T""" + ctx.save_for_backward(input_tokens, weight_stack, m_sizes, m_offsets) + ctx.strategy = strategy + + # Pre-allocate and reuse output tensor + device = input_tokens.device + total_tokens, in_features = input_tokens.shape + num_experts, out_features, _ = weight_stack.shape + + output = torch.zeros( + total_tokens, out_features, dtype=strategy.DTYPE_TORCH, device=device + ) + + if not torch.any(m_sizes > 0): + return output + + return CUTLASSBackwardGroupGemm._execute_grouped_gemm_forward( + input_tokens, weight_stack, m_sizes, m_offsets, output, strategy + ) + + @staticmethod + def backward(ctx, grad_output): + """Optimized backward pass with fused gradient computations""" + input_tokens, weight_stack, m_sizes, m_offsets = ctx.saved_tensors + strategy = ctx.strategy + + grad_output = grad_output.contiguous() + device = grad_output.device + + # Pre-allocate gradient tensors + grad_input = torch.zeros_like(input_tokens) + grad_weight = torch.zeros_like(weight_stack) + + if not torch.any(m_sizes > 0): + return grad_input, grad_weight, None, None, None + + # OPTIMIZATION 1: Fused backward computation + # Compute both input and weight gradients in a single pass + CUTLASSBackwardGroupGemm._execute_fused_backward_gemm( + grad_output, + input_tokens, + weight_stack, + m_sizes, + m_offsets, + grad_input, + grad_weight, + strategy, + ) + + return grad_input, grad_weight, None, None, None + + @staticmethod + def _execute_grouped_gemm_forward( + input_tokens, weight_stack, m_sizes, m_offsets, output, strategy + ): + """Optimized forward execution with minimal overhead""" + valid_mask = m_sizes > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return output + + # OPTIMIZATION 2: Batch 
metadata preparation to reduce overhead + problem_sizes, strides_abc, ptrs_abc = ( + CUTLASSBackwardGroupGemm._prepare_batched_forward_metadata( + input_tokens, + weight_stack, + m_sizes, + m_offsets, + valid_indices, + output, + strategy, + ) + ) + + if len(problem_sizes) == 0: + return output + + # OPTIMIZATION 3: Single kernel launch for all experts + CUTLASSBackwardGroupGemm._execute_optimized_cutlass_kernel( + problem_sizes, strides_abc, ptrs_abc, output.device, strategy + ) + + return output + + @staticmethod + def _execute_fused_backward_gemm( + grad_output, + input_tokens, + weight_stack, + m_sizes, + m_offsets, + grad_input, + grad_weight, + strategy, + ): + """ + OPTIMIZATION 4: Fused backward computation + + Instead of separate kernels for input/weight gradients, we: + 1. Use CUTLASS's native layout system to avoid transpositions + 2. Leverage memory locality between input/weight gradient computations + 3. Minimize kernel launch overhead + """ + valid_mask = m_sizes > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return + + # OPTIMIZATION 5: Use CUTLASS LayoutRight/LayoutLeft to avoid transpositions + # Prepare both input and weight gradient operations together + input_grad_problems, weight_grad_problems = ( + CUTLASSBackwardGroupGemm._prepare_fused_backward_metadata( + grad_output, + input_tokens, + weight_stack, + m_sizes, + m_offsets, + valid_indices, + grad_input, + grad_weight, + strategy, + ) + ) + + # Execute input gradient kernel (if needed) + if input_grad_problems[0]: # problem_sizes + CUTLASSBackwardGroupGemm._execute_optimized_cutlass_kernel( + *input_grad_problems, grad_input.device, strategy + ) + + # Execute weight gradient kernel (leveraging warm caches) + if weight_grad_problems[0]: # problem_sizes + CUTLASSBackwardGroupGemm._execute_optimized_cutlass_kernel( + *weight_grad_problems, grad_weight.device, strategy + ) + + @staticmethod + def _prepare_batched_forward_metadata( + input_tokens, weight_stack, m_sizes, m_offsets, valid_indices, output, strategy + ): + """ + OPTIMIZATION 6: Optimized metadata preparation + + - Minimize CPU-GPU synchronization + - Use vectorized operations where possible + - Pre-allocate arrays to avoid repeated allocations + """ + device = input_tokens.device + num_valid = len(valid_indices) + + # Pre-allocate metadata arrays + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + + # OPTIMIZATION 7: Vectorized size/offset computation + valid_sizes = m_sizes[valid_indices] + if len(m_offsets) > len(valid_indices): + valid_offsets = m_offsets[valid_indices] + else: + valid_offsets = torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 + ) + + # Single CPU-GPU sync for all sizes/offsets + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + valid_indices_cpu = valid_indices.cpu().tolist() + + # Batch process all experts + for expert_idx, size, offset in zip( + valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu + ): + if size > 0: + # Direct pointer arithmetic for contiguous access + input_ptr = ( + input_tokens.data_ptr() + + offset * input_tokens.stride(0) * input_tokens.element_size() + ) + weight_ptr = weight_stack[expert_idx].data_ptr() + output_ptr = ( + output.data_ptr() + + offset * output.stride(0) * output.element_size() + ) + + # OPTIMIZATION 8: Pre-computed strides to avoid repeated calculations + in_features = input_tokens.shape[1] + out_features = weight_stack.shape[1] + + M, K, N, L = size, 
in_features, out_features, 1 + + # Optimized stride calculations + A_strides = [input_tokens.stride(0), input_tokens.stride(1)] + B_strides = [ + weight_stack.stride(1), + weight_stack.stride(2), + ] # [N, K] layout + C_strides = [output.stride(0), output.stride(1)] + + problem_sizes.append([M, N, K, L]) + strides_abc.append([A_strides, B_strides, C_strides]) + ptrs_abc.append([input_ptr, weight_ptr, output_ptr]) + + return problem_sizes, strides_abc, ptrs_abc + + @staticmethod + def _prepare_fused_backward_metadata( + grad_output, + input_tokens, + weight_stack, + m_sizes, + m_offsets, + valid_indices, + grad_input, + grad_weight, + strategy, + ): + """ + OPTIMIZATION 9: Fused backward metadata preparation + + Prepare both input and weight gradient operations simultaneously to: + - Minimize metadata preparation overhead + - Leverage shared computations + - Optimize memory access patterns + """ + device = grad_output.device + + # Shared offset/size computations + valid_sizes = m_sizes[valid_indices] + if len(m_offsets) > len(valid_indices): + valid_offsets = m_offsets[valid_indices] + else: + valid_offsets = torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 + ) + + # Single sync for all metadata + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + valid_indices_cpu = valid_indices.cpu().tolist() + + # Prepare input gradient metadata (dX = dY @ W) + input_grad_problems = CUTLASSBackwardGroupGemm._prepare_input_grad_optimized( + grad_output, + weight_stack, + valid_indices_cpu, + valid_sizes_cpu, + valid_offsets_cpu, + grad_input, + strategy, + ) + + # Prepare weight gradient metadata (dW = dY^T @ X) + weight_grad_problems = CUTLASSBackwardGroupGemm._prepare_weight_grad_optimized( + grad_output, + input_tokens, + valid_indices_cpu, + valid_sizes_cpu, + valid_offsets_cpu, + grad_weight, + strategy, + ) + + return input_grad_problems, weight_grad_problems + + @staticmethod + def _prepare_input_grad_optimized( + grad_output, + weight_stack, + valid_indices_cpu, + valid_sizes_cpu, + valid_offsets_cpu, + grad_input, + strategy, + ): + """ + OPTIMIZATION 10: Use CUTLASS LayoutLeft to compute dY @ W without transposition + + Instead of reformulating as dX^T = W^T @ dY^T, we use CUTLASS's layout system + to directly compute dY @ W by treating W as LayoutLeft. 
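+
+        A PyTorch-only sketch of the equivalence this relies on (illustrative
+        shapes only, e.g. M=4 tokens, N=8 out_features, K=16 in_features;
+        the names are not part of the kernel API):
+
+            dY = torch.randn(4, 8)      # [M, N] upstream gradient
+            W = torch.randn(8, 16)      # [N, K] expert weight
+            dX = dY @ W                 # [M, K] reference result
+
+        Describing W with swapped strides lets an A @ B^T kernel read W^T
+        without materializing a transposed copy, producing the same dX.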
+ """ + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + + for expert_idx, size, offset in zip( + valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu + ): + if size > 0: + # Direct computation: dY @ W where dY:[M,N], W:[N,K] -> dX:[M,K] + grad_ptr = ( + grad_output.data_ptr() + + offset * grad_output.stride(0) * grad_output.element_size() + ) + weight_ptr = weight_stack[expert_idx].data_ptr() + grad_input_ptr = ( + grad_input.data_ptr() + + offset * grad_input.stride(0) * grad_input.element_size() + ) + + M, N = size, grad_output.shape[1] + K = weight_stack.shape[2] + L = 1 + + # OPTIMIZATION 11: Use optimal CUTLASS layout to avoid transpose + # Configure strides for LayoutLeft on B matrix to get A @ B instead of A @ B^T + A_strides = [grad_output.stride(0), grad_output.stride(1)] + B_strides = [ + weight_stack.stride(2), + weight_stack.stride(1), + ] # Swap strides for transpose effect + C_strides = [grad_input.stride(0), grad_input.stride(1)] + + problem_sizes.append([M, K, N, L]) # Note: N and K swapped for layout + strides_abc.append([A_strides, B_strides, C_strides]) + ptrs_abc.append([grad_ptr, weight_ptr, grad_input_ptr]) + + return problem_sizes, strides_abc, ptrs_abc + + @staticmethod + def _prepare_weight_grad_optimized( + grad_output, + input_tokens, + valid_indices_cpu, + valid_sizes_cpu, + valid_offsets_cpu, + grad_weight, + strategy, + ): + """ + OPTIMIZATION 12: Direct weight gradient computation using stride manipulation + + Compute dW = dY^T @ X directly by using appropriate stride configurations + instead of creating transposed tensors. + """ + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + + for expert_idx, size, offset in zip( + valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu + ): + if size > 0: + # Direct computation: dY^T @ X where dY:[M,N], X:[M,K] -> dW:[N,K] + grad_ptr = ( + grad_output.data_ptr() + + offset * grad_output.stride(0) * grad_output.element_size() + ) + input_ptr = ( + input_tokens.data_ptr() + + offset * input_tokens.stride(0) * input_tokens.element_size() + ) + weight_grad_ptr = grad_weight[expert_idx].data_ptr() + + M, N = size, grad_output.shape[1] + K = input_tokens.shape[1] + L = 1 + + # OPTIMIZATION 13: Stride configuration for dY^T @ X computation + # Treat dY as transposed by swapping its strides + A_strides = [ + grad_output.stride(1), + grad_output.stride(0), + ] # Transposed dY strides + B_strides = [ + input_tokens.stride(1), + input_tokens.stride(0), + ] # X strides for B^T + C_strides = [grad_weight.stride(1), grad_weight.stride(2)] + + problem_sizes.append([N, K, M, L]) + strides_abc.append([A_strides, B_strides, C_strides]) + ptrs_abc.append([grad_ptr, input_ptr, weight_grad_ptr]) + + return problem_sizes, strides_abc, ptrs_abc + + @staticmethod + def _execute_optimized_cutlass_kernel( + problem_sizes, strides_abc, ptrs_abc, device, strategy + ): + """ + OPTIMIZATION 14: Optimized CUTLASS kernel execution + + - Reuse compiled kernels aggressively + - Minimize tensor creation overhead + - Use optimal cluster configurations + """ + if not problem_sizes: + return + + num_groups = len(problem_sizes) + + # OPTIMIZATION 15: Reuse tensor allocations using memory pool + if not hasattr(strategy, "_tensor_pool"): + strategy._tensor_pool = {} + + # Create metadata tensors with memory reuse + cache_key = (num_groups, device) + if cache_key not in strategy._tensor_pool: + strategy._tensor_pool[cache_key] = { + "problem_sizes": torch.empty( + num_groups, 4, dtype=torch.int32, device=device + ), + "strides": torch.empty( + 
num_groups, 3, 2, dtype=torch.int32, device=device + ), + "ptrs": torch.empty(num_groups, 3, dtype=torch.int64, device=device), + } + + tensors = strategy._tensor_pool[cache_key] + + # Fill tensors directly to avoid allocations + tensors["problem_sizes"][: len(problem_sizes)] = torch.tensor( + problem_sizes, device=device + ) + tensors["strides"][: len(strides_abc)] = torch.tensor( + strides_abc, device=device + ) + tensors["ptrs"][: len(ptrs_abc)] = torch.tensor(ptrs_abc, device=device) + + # Convert to CUTE tensors + problem_sizes_cute = from_dlpack( + tensors["problem_sizes"][: len(problem_sizes)], + assumed_align=strategy.ALIGNMENT, + ) + strides_cute = from_dlpack( + tensors["strides"][: len(strides_abc)], assumed_align=strategy.ALIGNMENT + ) + ptrs_cute = from_dlpack( + tensors["ptrs"][: len(ptrs_abc)], assumed_align=strategy.ALIGNMENT + ) + + # OPTIMIZATION 16: Aggressive kernel caching with finer granularity + total_clusters = strategy._compute_total_clusters(problem_sizes) + cache_key = ( + num_groups, + total_clusters, + tuple(problem_sizes[0][:3]), + ) # Include problem shape + + if cache_key not in strategy._compiled_kernels: + tensormap_cute = strategy._get_tensormap_buffer(device) + initial_tensors = strategy._create_initial_tensors(problem_sizes[0], device) + + strategy._compiled_kernels[cache_key] = cute.compile( + strategy.grouped_gemm, + *initial_tensors, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + strategy.max_active_clusters, + strategy.stream, + ) + + # Execute with cached kernel + compiled_kernel = strategy._compiled_kernels[cache_key] + tensormap_cute = strategy._get_tensormap_buffer(device) + initial_tensors = strategy._create_initial_tensors(problem_sizes[0], device) + + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + strategy.stream, + ) + + # OPTIMIZATION 17: Asynchronous execution - don't sync unless needed + # torch.cuda.synchronize() # Only sync when absolutely necessary + + +# OPTIMIZATION 18: Memory-efficient strategy configuration +class CUTLASSGroupedGemmStrategyOptimized(CUTLASSGroupedGemmStrategy): + """ + Performance-optimized strategy with additional caching and memory management. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # OPTIMIZATION 19: Pre-allocate commonly used tensor shapes + self._tensor_pool = {} + self._kernel_cache_hits = 0 + self._kernel_cache_misses = 0 + + # OPTIMIZATION 20: Tune cluster configuration for backward passes + self._optimize_cluster_config() + + def _optimize_cluster_config(self): + """ + OPTIMIZATION 21: Dynamically tune cluster configuration + + Backward passes have different compute/memory patterns than forward, + so we optimize cluster shapes accordingly. 
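+
+        The remapping applied in the body below is:
+
+            (4, 4) -> (2, 2)    # smaller clusters for gradient GEMMs
+            (2, 2) -> (1, 2)    # even more conservative
+
+        Any other configured cluster shape is left unchanged.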
+ """ + # Smaller clusters often work better for gradient computations + # due to different memory access patterns + if self.cluster_shape_mn == (4, 4): + self.cluster_shape_mn = (2, 2) # Better for gradient computations + elif self.cluster_shape_mn == (2, 2): + self.cluster_shape_mn = (1, 2) # Even more conservative + + def get_cache_stats(self): + """Get kernel cache performance statistics""" + total = self._kernel_cache_hits + self._kernel_cache_misses + hit_rate = self._kernel_cache_hits / total if total > 0 else 0 + return { + "cache_hits": self._kernel_cache_hits, + "cache_misses": self._kernel_cache_misses, + "hit_rate": hit_rate, + } + + +# OPTIMIZATION 22: High-level optimized linear layer +class CUTLASSGroupedLinearOptimized(nn.Module): + """ + Optimized CUTLASS grouped linear layer with performance enhancements. + """ + + def __init__(self, num_experts, in_features, out_features, strategy, **kwargs): + super().__init__() + self.num_experts = num_experts + self.in_features = in_features + self.out_features = out_features + self.strategy = strategy + self.dtype = kwargs.get("dtype", torch.bfloat16) + + # OPTIMIZATION 23: Use optimized parameter initialization + self.weight = nn.Parameter( + torch.empty( + num_experts, out_features, in_features, dtype=self.dtype, device="cuda" + ) # Pre-allocate on GPU + ) + self.reset_parameters() + + # OPTIMIZATION 24: Pre-compute commonly used tensors + self._size_tensor_cache = {} + + def forward(self, input_tokens, expert_assignments): + """Optimized forward pass with caching""" + # OPTIMIZATION 25: Cache size/offset computations for repeated patterns + assignment_hash = hash(expert_assignments.data_ptr()) + if assignment_hash in self._size_tensor_cache: + m_sizes, m_offsets = self._size_tensor_cache[assignment_hash] + else: + m_sizes, m_offsets = self._compute_expert_sizes_and_offsets( + expert_assignments + ) + if len(self._size_tensor_cache) < 100: # Limit cache size + self._size_tensor_cache[assignment_hash] = (m_sizes, m_offsets) + + # OPTIMIZATION 26: Skip sorting if already sorted (common in some workloads) + if torch.all(expert_assignments[:-1] <= expert_assignments[1:]): + sorted_tokens = input_tokens + sorted_indices = torch.arange(len(input_tokens), device=input_tokens.device) + else: + sorted_indices = torch.argsort(expert_assignments) + sorted_tokens = input_tokens[sorted_indices] + + # Use optimized backward function + sorted_output = CUTLASSBackwardGroupGemmOptimized.apply( + sorted_tokens, self.weight, m_sizes, m_offsets, self.strategy + ) + + # OPTIMIZATION 27: Avoid unnecessary tensor creation for unsort + if torch.equal( + sorted_indices, torch.arange(len(input_tokens), device=input_tokens.device) + ): + return sorted_output + else: + output = torch.empty_like(sorted_output) + output[sorted_indices] = sorted_output + return output + + +class CUTLASSBackwardGroupGemm_prev(torch.autograd.Function): """ PyTorch autograd Function for CUTLASS grouped GEMM with backward pass support. 
@@ -1037,5 +1632,525 @@ def test_cutlass_backward_group_gemm(): return True +def _test_single_expert_cutlass_fixed( + grad_expert, input_expert, weight_expert, strategy +): + """Test CUTLASS operations on a single expert with CORRECTED dimensions""" + M, N = grad_expert.shape # [M, N] - grad_expert + M_i, K = input_expert.shape # [M, K] - input_expert + N_w, K_w = weight_expert.shape # [N, K] - weight_expert + + assert ( + M == M_i and N == N_w and K == K_w + ), f"Shape mismatch: {grad_expert.shape}, {input_expert.shape}, {weight_expert.shape}" + + device = grad_expert.device + + print( + f" Shapes: grad_expert={grad_expert.shape}, input_expert={input_expert.shape}, weight_expert={weight_expert.shape}" + ) + + # Test input gradient: dX = dY @ W + print(f" Testing input gradient: dX = dY @ W") + ref_grad_input = torch.mm(grad_expert, weight_expert) # [M, N] @ [N, K] = [M, K] + print( + f" Reference dX shape: {ref_grad_input.shape}, norm: {ref_grad_input.norm().item():.4f}" + ) + + # CUTLASS approach: Since CUTLASS computes A @ B^T, reformulate dX = dY @ W as: + # dX^T = W^T @ dY^T, then transpose result + # A = W^T [K, N], B = dY^T [N, M], C = dX^T [K, M] + weight_T = weight_expert.t().contiguous() # [K, N] + grad_T = grad_expert.t().contiguous() # [N, M] + result_T = torch.zeros(K, M, dtype=strategy.DTYPE_TORCH, device=device) # [K, M] + + print( + f" CUTLASS matrices: A=W^T{weight_T.shape}, B=dY^T{grad_T.shape}, C=dX^T{result_T.shape}" + ) + + # CORRECT problem size: A[K,N] @ B^T[N,M] = C[K,M] + # Note: CUTLASS will transpose B, so B^T becomes dY^T^T = dY + problem_sizes = [ + [K, M, N, 1] + ] # [M, N, K, L] format but our actual computation is [K, M, N, 1] + + print(f" Problem size: {problem_sizes[0]}") + + # Set up strides for A @ B^T where CUTLASS transposes B + A_mnkl = weight_T.unsqueeze(-1).contiguous() # [K, N, 1] + B_mnkl = grad_T.unsqueeze( + -1 + ).contiguous() # [N, M, 1] - CUTLASS will transpose to [M, N, 1] + C_mnkl = result_T.unsqueeze(-1).contiguous() # [K, M, 1] + + A_strides = list(A_mnkl.stride()[:2]) + B_strides = list(B_mnkl.stride()[:2]) + C_strides = list(C_mnkl.stride()[:2]) + + strides_abc = [[A_strides, B_strides, C_strides]] + ptrs_abc = [[weight_T.data_ptr(), grad_T.data_ptr(), result_T.data_ptr()]] + + print(f" Strides: A={A_strides}, B={B_strides}, C={C_strides}") + + CUTLASSBackwardGroupGemmDebug._execute_cutlass_kernel_debug( + problem_sizes, strides_abc, ptrs_abc, device, strategy, "single_input_grad" + ) + + grad_input_cutlass = result_T.t() # Transpose back to [M, K] + print( + f" CUTLASS dX shape: {grad_input_cutlass.shape}, norm: {grad_input_cutlass.norm().item():.4f}" + ) + + input_grad_diff = torch.abs(grad_input_cutlass - ref_grad_input).max().item() + input_grad_rel = input_grad_diff / ref_grad_input.abs().max().item() + print( + f" Input grad diff: {input_grad_diff:.2e} (relative: {input_grad_rel:.2e})" + ) + + # Test weight gradient: dW = dY^T @ X + print(f" Testing weight gradient: dW = dY^T @ X") + ref_grad_weight = torch.mm( + grad_expert.t(), input_expert + ) # [N, M] @ [M, K] = [N, K] + print( + f" Reference dW shape: {ref_grad_weight.shape}, norm: {ref_grad_weight.norm().item():.4f}" + ) + + # CUTLASS approach: dW = dY^T @ X + # This is already in A @ B^T form if we set B^T = X^T + # A = dY^T [N, M], B^T = X^T [K, M], C = dW [N, K] + grad_weight_cutlass = torch.zeros(N, K, dtype=strategy.DTYPE_TORCH, device=device) + input_T = input_expert.t().contiguous() # [K, M] + + print( + f" CUTLASS matrices: A=dY^T{grad_T.shape}, 
B^T=X^T{input_T.shape}, C=dW{grad_weight_cutlass.shape}" + ) + + # Problem size: A[N,M] @ B^T[K,M] = C[N,K] + # CUTLASS will compute A @ B^T = dY^T @ (X^T)^T = dY^T @ X = dW + problem_sizes = [[N, K, M, 1]] + + print(f" Problem size: {problem_sizes[0]}") + + A_mnkl = grad_T.unsqueeze(-1).contiguous() # [N, M, 1] + B_mnkl = input_T.unsqueeze(-1).contiguous() # [K, M, 1] + C_mnkl = grad_weight_cutlass.unsqueeze(-1).contiguous() # [N, K, 1] + + A_strides = list(A_mnkl.stride()[:2]) + B_strides = list(B_mnkl.stride()[:2]) + C_strides = list(C_mnkl.stride()[:2]) + + strides_abc = [[A_strides, B_strides, C_strides]] + ptrs_abc = [[grad_T.data_ptr(), input_T.data_ptr(), grad_weight_cutlass.data_ptr()]] + + print(f" Strides: A={A_strides}, B={B_strides}, C={C_strides}") + + CUTLASSBackwardGroupGemmDebug._execute_cutlass_kernel_debug( + problem_sizes, strides_abc, ptrs_abc, device, strategy, "single_weight_grad" + ) + + print( + f" CUTLASS dW shape: {grad_weight_cutlass.shape}, norm: {grad_weight_cutlass.norm().item():.4f}" + ) + + weight_grad_diff = torch.abs(grad_weight_cutlass - ref_grad_weight).max().item() + weight_grad_rel = weight_grad_diff / ref_grad_weight.abs().max().item() + print( + f" Weight grad diff: {weight_grad_diff:.2e} (relative: {weight_grad_rel:.2e})" + ) + + return grad_input_cutlass, grad_weight_cutlass + + +def _prepare_input_grad_approach_2_fixed( + grad_output, + weight_stack, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + grad_input, + strategy, +): + """Prepare input gradient with CORRECTED explicit transpositions""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + temp_results = [] + + device = grad_output.device + valid_sizes = m_sizes_gpu[valid_indices].cpu().tolist() + valid_offsets = ( + ( + m_offsets_gpu[valid_indices] + if len(m_offsets_gpu) > len(valid_indices) + else torch.cumsum( + torch.cat( + [torch.tensor([0], device=device), m_sizes_gpu[valid_indices][:-1]] + ), + dim=0, + ) + ) + .cpu() + .tolist() + ) + valid_indices_cpu = valid_indices.cpu().tolist() + + print(f" Preparing input gradients for {len(valid_indices_cpu)} experts") + + for i, (expert_idx, size, offset) in enumerate( + zip(valid_indices_cpu, valid_sizes, valid_offsets) + ): + if size > 0: + grad_expert = grad_output[offset : offset + size].contiguous() # [M, N] + weight_expert = weight_stack[expert_idx].contiguous() # [N, K] + + M, N = grad_expert.shape + N_w, K = weight_expert.shape + + if N != N_w: + print( + f" ERROR: Expert {expert_idx}, N mismatch: grad_expert has {N}, weight has {N_w}" + ) + continue + + print( + f" Expert {expert_idx}: grad_expert{grad_expert.shape}, weight{weight_expert.shape}" + ) + + # Reformulate dX = dY @ W as dX^T = W^T @ dY^T + weight_T = weight_expert.t().contiguous() # [K, N] + grad_T = grad_expert.t().contiguous() # [N, M] + result_T = torch.zeros( + K, M, dtype=strategy.DTYPE_TORCH, device=device + ) # [K, M] + temp_results.append((result_T, offset, size)) + + print( + f" CUTLASS setup: W^T{weight_T.shape} @ (dY^T)^T = dX^T{result_T.shape}" + ) + + # CUTLASS: A @ B^T where A = W^T [K, N], B = dY^T [N, M] + # Problem: [K, M, N, 1] since CUTLASS computes A[K,N] @ B^T[N,M] = C[K,M] + _add_simple_gemm_fixed( + weight_T, grad_T, result_T, problem_sizes, strides_abc, ptrs_abc + ) + + return problem_sizes, strides_abc, ptrs_abc, temp_results + + +def _prepare_weight_grad_approach_2_fixed( + grad_output, + input_tokens, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + grad_weight, + strategy, +): + """Prepare weight gradient with CORRECTED explicit 
transpositions""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + + device = grad_output.device + valid_sizes = m_sizes_gpu[valid_indices].cpu().tolist() + valid_offsets = ( + ( + m_offsets_gpu[valid_indices] + if len(m_offsets_gpu) > len(valid_indices) + else torch.cumsum( + torch.cat( + [torch.tensor([0], device=device), m_sizes_gpu[valid_indices][:-1]] + ), + dim=0, + ) + ) + .cpu() + .tolist() + ) + valid_indices_cpu = valid_indices.cpu().tolist() + + print(f" Preparing weight gradients for {len(valid_indices_cpu)} experts") + + for i, (expert_idx, size, offset) in enumerate( + zip(valid_indices_cpu, valid_sizes, valid_offsets) + ): + if size > 0: + grad_expert = grad_output[offset : offset + size].contiguous() # [M, N] + input_expert = input_tokens[offset : offset + size].contiguous() # [M, K] + weight_grad_expert = grad_weight[expert_idx] # [N, K] + + M, N = grad_expert.shape + M_i, K = input_expert.shape + + if M != M_i: + print( + f" ERROR: Expert {expert_idx}, M mismatch: grad_expert has {M}, input has {M_i}" + ) + continue + + print( + f" Expert {expert_idx}: grad_expert{grad_expert.shape}, input{input_expert.shape}" + ) + + # dW = dY^T @ X: A = dY^T [N, M], B^T = X^T [K, M], C = dW [N, K] + grad_T = grad_expert.t().contiguous() # [N, M] + input_T = input_expert.t().contiguous() # [K, M] + + print( + f" CUTLASS setup: dY^T{grad_T.shape} @ X = dW{weight_grad_expert.shape}" + ) + + # CUTLASS: A @ B^T where A = dY^T [N, M], B = X^T [K, M] + # Problem: [N, K, M, 1] since CUTLASS computes A[N,M] @ B^T[K,M] = C[N,K] + _add_simple_gemm_fixed( + grad_T, + input_T, + weight_grad_expert, + problem_sizes, + strides_abc, + ptrs_abc, + ) + + return problem_sizes, strides_abc, ptrs_abc, None + + +def _add_simple_gemm_fixed(A, B, C, problem_sizes, strides_abc, ptrs_abc): + """Add a simple GEMM operation: C = A @ B^T with CORRECT dimensions""" + M, K = A.shape # A is [M, K] + N, K_B = B.shape # B is [N, K_B], will be transposed to [K_B, N] + + if K != K_B: + print(f" ERROR: Inner dimension mismatch: A has K={K}, B has K_B={K_B}") + print(f" A shape: {A.shape}, B shape: {B.shape}, C shape: {C.shape}") + assert False, f"Inner dimension mismatch: {K} != {K_B}" + + L = 1 + + # Expected output shape for A[M,K] @ B^T[K,N] = C[M,N] + expected_C_shape = (M, N) + if C.shape != expected_C_shape: + print( + f" ERROR: Output shape mismatch: expected {expected_C_shape}, got {C.shape}" + ) + assert ( + False + ), f"Output shape mismatch: expected {expected_C_shape}, got {C.shape}" + + # Convert to MNKL format + A_mnkl = A.unsqueeze(-1).contiguous() # [M, K, 1] + B_mnkl = B.unsqueeze( + -1 + ).contiguous() # [N, K, 1] - CUTLASS will transpose to [K, N, 1] + C_mnkl = C.unsqueeze(-1).contiguous() # [M, N, 1] + + A_strides = list(A_mnkl.stride()[:2]) + B_strides = list(B_mnkl.stride()[:2]) + C_strides = list(C_mnkl.stride()[:2]) + + # Problem size: [M, N, K, L] for CUTLASS + problem_sizes.append([M, N, K, L]) + strides_abc.append([A_strides, B_strides, C_strides]) + ptrs_abc.append([A.data_ptr(), B.data_ptr(), C.data_ptr()]) + + print( + f" Added GEMM: A{A.shape} @ B^T{B.shape} = C{C.shape}, problem=[{M}, {N}, {K}, {L}]" + ) + + +def _backward_approach_2_fixed( + grad_output, input_tokens, weight_stack, m_sizes_gpu, m_offsets_gpu, strategy +): + """ + Approach 2: Use CUTLASS with explicit transpositions - FIXED VERSION + """ + print(" 🔧 Using Approach 2: CUTLASS with explicit transpositions (FIXED)") + + device = grad_output.device + grad_input = torch.zeros_like(input_tokens) + grad_weight = 
torch.zeros_like(weight_stack) + + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return grad_input, grad_weight + + print(f" Processing {len(valid_indices)} experts") + + # Input gradient: dX = dY @ W (reformulated as dX^T = W^T @ dY^T) + print(" Computing input gradients...") + input_problems = _prepare_input_grad_approach_2_fixed( + grad_output, + weight_stack, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + grad_input, + strategy, + ) + + if input_problems[0]: # Has problems + print(f" Executing {len(input_problems[0])} input gradient problems") + CUTLASSBackwardGroupGemmDebug._execute_cutlass_kernel_debug( + *input_problems[:3], device, strategy, "input_grad" + ) + + # Reconstruct input gradients from transposed results + temp_results = input_problems[3] + for result_T, offset, size in temp_results: + grad_input[offset : offset + size] = ( + result_T.t() + ) # Transpose back to [M, K] + + # Weight gradient: dW = dY^T @ X + print(" Computing weight gradients...") + weight_problems = _prepare_weight_grad_approach_2_fixed( + grad_output, + input_tokens, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + grad_weight, + strategy, + ) + + if weight_problems[0]: # Has problems + print(f" Executing {len(weight_problems[0])} weight gradient problems") + CUTLASSBackwardGroupGemmDebug._execute_cutlass_kernel_debug( + *weight_problems, device, strategy, "weight_grad" + ) + + return grad_input, grad_weight + + +# Update the test_single_expert_cutlass function in the debug class +def patch_debug_class(): + """Patch the debug class with fixed methods""" + # Replace the broken method + CUTLASSBackwardGroupGemmDebug._test_single_expert_cutlass = staticmethod( + _test_single_expert_cutlass_fixed + ) + CUTLASSBackwardGroupGemmDebug._backward_approach_2 = staticmethod( + _backward_approach_2_fixed + ) + CUTLASSBackwardGroupGemmDebug._prepare_input_grad_approach_2 = staticmethod( + _prepare_input_grad_approach_2_fixed + ) + CUTLASSBackwardGroupGemmDebug._prepare_weight_grad_approach_2 = staticmethod( + _prepare_weight_grad_approach_2_fixed + ) + CUTLASSBackwardGroupGemmDebug._add_simple_gemm = staticmethod( + _add_simple_gemm_fixed + ) + print("✅ Patched debug class with fixed methods") + + +def test_fixed_implementation(): + """Test the fixed implementation""" + print("🧪 Testing Fixed CUTLASS Implementation") + print("=" * 50) + + # Import and patch + try: + from cutlass_backwards_debug import ( + CUTLASSBackwardGroupGemmDebug, + CUTLASSGroupedGemmStrategyDebug, + CUTLASSGroupedLinearDebug, + ) + + patch_debug_class() + except ImportError: + print("❌ Cannot import debug modules") + return + + device = torch.device("cuda") + dtype = torch.bfloat16 + + # Test single expert first + print("\n🔍 Testing Fixed Single Expert") + print("-" * 30) + + M, N, K = 32, 64, 128 + grad_expert = torch.randn(M, N, dtype=dtype, device=device) + input_expert = torch.randn(M, K, dtype=dtype, device=device) + weight_expert = torch.randn(N, K, dtype=dtype, device=device) + + strategy = CUTLASSGroupedGemmStrategyDebug( + debug_mode=True, + backward_method="approach_3", + use_2cta_instrs=False, + mma_tiler_mn=(128, 128), + cluster_shape_mn=(1, 1), + ) + + try: + grad_input_cutlass, grad_weight_cutlass = _test_single_expert_cutlass_fixed( + grad_expert, input_expert, weight_expert, strategy + ) + print("✅ Fixed single expert test completed") + + except Exception as e: + print(f"❌ Fixed single expert test failed: {e}") + import traceback + + 
traceback.print_exc() + return + + # Test grouped operations + print("\n🔍 Testing Fixed Grouped Operations") + print("-" * 35) + + num_experts = 4 + in_features = 256 + out_features = 512 + total_tokens = 128 + + strategy = CUTLASSGroupedGemmStrategyDebug( + debug_mode=True, + backward_method="approach_2", # Use the fixed approach_2 + use_2cta_instrs=False, + mma_tiler_mn=(128, 128), + cluster_shape_mn=(1, 1), + ) + + try: + input_tokens = torch.randn( + total_tokens, in_features, dtype=dtype, device=device, requires_grad=True + ) + expert_assignments = torch.randint( + 0, num_experts, (total_tokens,), device=device + ) + + layer = CUTLASSGroupedLinearDebug( + num_experts, in_features, out_features, strategy, dtype=dtype + ) + layer = layer.to(device) + + # Forward pass + output = layer(input_tokens, expert_assignments) + + # Backward pass + loss = output.sum() + loss.backward() + + print("✅ Fixed grouped operations completed successfully") + + # Check gradient magnitudes + if input_tokens.grad is not None: + print(f" Input grad norm: {input_tokens.grad.norm().item():.4f}") + if layer.weight.grad is not None: + print(f" Weight grad norm: {layer.weight.grad.norm().item():.4f}") + + except Exception as e: + print(f"❌ Fixed grouped operations failed: {e}") + import traceback + + traceback.print_exc() + + if __name__ == "__main__": - test_cutlass_backward_group_gemm() + test_fixed_implementation() + +# if __name__ == "__main__": +# test_cutlass_backward_group_gemm() diff --git a/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py b/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py new file mode 100644 index 000000000..20e34afbc --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py @@ -0,0 +1,669 @@ +#!/usr/bin/env python3 +""" +Standalone CUTLASS backward pass test. +Self-contained with no external dependencies beyond basic CUTLASS. + + +current: + +CUTLASS computation: + Executing backward_input: Atorch.Size([32, 64]) @ B^Ttorch.Size([64, 128]) = Ctorch.Size([32, 128]) + Problem: [32, 128, 64, 1] + Strides: [[64, 1], [128, 1], [128, 1]] +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 + Compiling kernel for backward_input... + ✅ Kernel compiled + ✅ backward_input executed +❌ Complete Backward crashed: Inner dimension mismatch: 32 != 128 +Traceback (most recent call last): + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py", line 576, in main + success = test_func() + ^^^^^^^^^^^ + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py", line 409, in test_complete_backward + strategy.execute_cutlass_gemm(dY_T, X, dW_cutlass, "backward_weight") + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py", line 123, in execute_cutlass_gemm + assert K == K_B, f"Inner dimension mismatch: {K} != {K_B}" + ^^^^^^^^ +AssertionError: Inner dimension mismatch: 32 != 128 + +============================================================ + +🔍 Testing Grouped Backward (2 experts) +============================================= +🔧 Initializing standalone CUTLASS strategy... 
+cute hardware - device_id 0 +cute hardware - driver_version 12080 +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 +✅ Strategy initialized (max_active_clusters: 148) +Setup: 2 experts, 16 tokens each +X: torch.Size([32, 64]), W: torch.Size([2, 128, 64]), dY: torch.Size([32, 128]) +Reference dX norm: 520.0000 +Reference dW norm: 520.0000 + +Expert 0: +❌ Grouped Backward crashed: Inner dimension mismatch: 128 != 16 +Traceback (most recent call last): + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py", line 576, in main + success = test_func() + ^^^^^^^^^^^ + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py", line 509, in test_grouped_backward + strategy.execute_cutlass_gemm(W_T, dY_T, dX_T, f"expert_{expert_idx}_input") + File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py", line 123, in execute_cutlass_gemm + assert K == K_B, f"Inner dimension mismatch: {K} != {K_B}" + ^^^^^^^^ +AssertionError: Inner dimension mismatch: 128 != 16 + +============================================================ +📊 FINAL RESULTS +============================================================ +Basic CUTLASS GEMM ✅ PASS +Input Gradient ✅ PASS +Weight Gradient ✅ PASS +Complete Backward 💥 CRASH +Grouped Backward 💥 CRASH + +Overall: 3/5 tests passed +""" + +import torch +import torch.nn as nn + +try: + import cuda.bindings.driver as cuda + import cutlass + import cutlass.cute as cute + import cutlass.utils as utils + from cutlass.cute.runtime import from_dlpack + from torchtitan.experiments.kernels.blackwell.cute_grouped_gemm import ( + GroupedGemmKernel, + ) + + HAS_CUTLASS = True + print("✅ CUTLASS imports successful") +except ImportError as e: + HAS_CUTLASS = False + print(f"❌ CUTLASS import failed: {e}") + exit(1) + + +class StandaloneCutlassStrategy: + """Self-contained CUTLASS strategy for testing""" + + def __init__(self): + print("🔧 Initializing standalone CUTLASS strategy...") + + # Force CUDA context creation + dummy = torch.zeros(1, device="cuda") + dummy.cpu() + + self.DTYPE_TORCH = torch.bfloat16 + self.DTYPE_CUTLASS = cutlass.BFloat16 + self.ACC_DTYPE = cutlass.Float32 + self.ALIGNMENT = 16 + + # Initialize kernel + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.ACC_DTYPE, + use_2cta_instrs=False, + mma_tiler_mn=(128, 128), + cluster_shape_mn=(1, 1), + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + # Initialize hardware info + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters(1) + + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) + + self._compiled_kernels = {} + self._tensormap_buffers = {} + + print( + f"✅ Strategy initialized (max_active_clusters: {self.max_active_clusters})" + ) + + def _get_tensormap_buffer(self, device): + if device not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + tensormap_tensor = torch.zeros( + (sm_count, 3, 128 // 8), + dtype=torch.int64, + device=device, + ) + self._tensormap_buffers[device] = from_dlpack( + tensormap_tensor, assumed_align=self.ALIGNMENT + ) + return self._tensormap_buffers[device] + + def _compute_total_clusters(self, problem_sizes): + cluster_tile_m = 128 + cluster_tile_n = 128 + + total = 0 + for M, N, K, L in problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + 
total += clusters_m * clusters_n + return total + + def _create_initial_tensors(self, problem_shape, device): + M, N, K, L = problem_shape + + tensors = [ + torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), + torch.randn(N, K, dtype=self.DTYPE_TORCH, device=device), + torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), + ] + + cute_tensors = [] + for tensor in tensors: + mnkl_tensor = tensor.unsqueeze(-1).contiguous() + cute_tensor = from_dlpack(mnkl_tensor, assumed_align=self.ALIGNMENT) + cute_tensor.element_type = self.DTYPE_CUTLASS + cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) + cute_tensors.append(cute_tensor) + + return cute_tensors + + def execute_cutlass_gemm(self, A, B, C, operation_name="gemm"): + """Execute a single CUTLASS GEMM: C = A @ B^T""" + M, K = A.shape + N, K_B = B.shape + + # For input gradient computation: dX = dY @ W + # dY is [M, N] and W is [N, K], so we need to handle this special case + if operation_name == "backward_input": + # For backward_input, we expect A=dY [M,N] and B=W [N,K] + # The inner dimensions should match (N == N) + assert K == N, f"Inner dimension mismatch for backward_input: {K} != {N}" + # Swap K_B and N for the assertion below + K_B, N = N, K_B + + assert K == K_B, f"Inner dimension mismatch: {K} != {K_B}" + assert C.shape == ( + M, + N, + ), f"Output shape mismatch: expected ({M}, {N}), got {C.shape}" + + L = 1 + device = A.device + + print(f" Executing {operation_name}: A{A.shape} @ B^T{B.shape} = C{C.shape}") + + # Convert to MNKL format + A_mnkl = A.unsqueeze(-1).contiguous() + B_mnkl = B.unsqueeze(-1).contiguous() + C_mnkl = C.unsqueeze(-1).contiguous() + + # Problem setup + problem_sizes = [[M, N, K, L]] + strides_abc = [ + [ + list(A_mnkl.stride()[:2]), + list(B_mnkl.stride()[:2]), + list(C_mnkl.stride()[:2]), + ] + ] + ptrs_abc = [[A.data_ptr(), B.data_ptr(), C.data_ptr()]] + + print(f" Problem: {problem_sizes[0]}") + print(f" Strides: {strides_abc[0]}") + + # Execute kernel + self._execute_kernel( + problem_sizes, strides_abc, ptrs_abc, device, operation_name + ) + + return C + + def _execute_kernel( + self, problem_sizes, strides_abc, ptrs_abc, device, operation_name + ): + """Execute the CUTLASS kernel""" + num_groups = len(problem_sizes) + + # Convert to tensors + problem_sizes_tensor = torch.tensor( + problem_sizes, dtype=torch.int32, device=device + ) + strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) + ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) + + # Convert to CUTE tensors + problem_sizes_cute = from_dlpack( + problem_sizes_tensor, assumed_align=self.ALIGNMENT + ) + strides_cute = from_dlpack(strides_tensor, assumed_align=self.ALIGNMENT) + ptrs_cute = from_dlpack(ptrs_tensor, assumed_align=self.ALIGNMENT) + + # Get buffers + tensormap_cute = self._get_tensormap_buffer(device) + total_clusters = self._compute_total_clusters(problem_sizes) + initial_tensors = self._create_initial_tensors(problem_sizes[0], device) + + # Compile kernel if needed + cache_key = (num_groups, total_clusters, tuple(problem_sizes[0][:3])) + + if cache_key not in self._compiled_kernels: + print(f" Compiling kernel for {operation_name}...") + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + *initial_tensors, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + self.stream, + ) + print(f" ✅ Kernel compiled") + + # Execute + compiled_kernel = 
self._compiled_kernels[cache_key] + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + torch.cuda.synchronize() + print(f" ✅ {operation_name} executed") + + +def test_basic_cutlass_gemm(): + """Test basic CUTLASS GEMM operation""" + print("\n🔍 Testing Basic CUTLASS GEMM") + print("=" * 40) + + device = torch.device("cuda") + dtype = torch.bfloat16 + + strategy = StandaloneCutlassStrategy() + + # Test matrices + M, N, K = 64, 128, 256 + A = torch.randn(M, K, dtype=dtype, device=device) + B = torch.randn(N, K, dtype=dtype, device=device) + C = torch.zeros(M, N, dtype=dtype, device=device) + + print(f"Testing: A{A.shape} @ B^T{B.shape} = C{C.shape}") + + # Reference result + C_ref = torch.mm(A, B.t()) + print(f"Reference norm: {C_ref.norm().item():.4f}") + + # CUTLASS result + strategy.execute_cutlass_gemm(A, B, C, "basic_test") + print(f"CUTLASS norm: {C.norm().item():.4f}") + + # Compare + diff = torch.abs(C - C_ref).max().item() + rel_diff = diff / C_ref.abs().max().item() + + print(f"Max difference: {diff:.2e}") + print(f"Relative difference: {rel_diff:.2e}") + + if rel_diff < 1e-2: + print("✅ Basic CUTLASS GEMM works!") + return True + else: + print("❌ Basic CUTLASS GEMM failed!") + return False + + +def test_input_gradient(): + """Test input gradient computation: dX = dY @ W""" + print("\n🔍 Testing Input Gradient") + print("=" * 30) + + device = torch.device("cuda") + dtype = torch.bfloat16 + + strategy = StandaloneCutlassStrategy() + + # Problem: dX = dY @ W where dY:[M,N], W:[N,K] -> dX:[M,K] + M, N, K = 32, 64, 128 + dY = torch.randn(M, N, dtype=dtype, device=device) + W = torch.randn(N, K, dtype=dtype, device=device) + + print(f"Computing: dX = dY{dY.shape} @ W{W.shape}") + + # Reference PyTorch + dX_ref = torch.mm(dY, W) # [M,N] @ [N,K] = [M,K] + print(f"Reference dX: {dX_ref.shape}, norm: {dX_ref.norm().item():.4f}") + + # CUTLASS approach: reformulate as dX^T = W^T @ dY^T + print("CUTLASS approach: dX^T = W^T @ dY^T") + + W_T = W.t().contiguous() # [K, N] + dY_T = dY.t().contiguous() # [N, M] + dX_T = torch.zeros(K, M, dtype=dtype, device=device) # [K, M] + + print(f" W^T{W_T.shape} @ (dY^T)^T{dY_T.shape} = dX^T{dX_T.shape}") + print(f" Note: CUTLASS computes W^T @ dY^T^T = W^T @ dY") + + # Execute: W^T @ dY^T^T (CUTLASS transposes second operand) + strategy.execute_cutlass_gemm(W_T, dY, dX_T, "input_gradient") + + # Transpose back to get dX + dX_cutlass = dX_T.t() # [M, K] + print(f"CUTLASS dX: {dX_cutlass.shape}, norm: {dX_cutlass.norm().item():.4f}") + + # Compare + diff = torch.abs(dX_cutlass - dX_ref).max().item() + rel_diff = diff / dX_ref.abs().max().item() + + print(f"Max difference: {diff:.2e}") + print(f"Relative difference: {rel_diff:.2e}") + + if rel_diff < 1e-2: + print("✅ Input gradient works!") + return True + else: + print("❌ Input gradient failed!") + print(f"First few elements - Ref: {dX_ref.flatten()[:5]}") + print(f"First few elements - CUTLASS: {dX_cutlass.flatten()[:5]}") + return False + + +def test_weight_gradient(): + """Test weight gradient computation: dW = dY^T @ X""" + print("\n🔍 Testing Weight Gradient") + print("=" * 30) + + device = torch.device("cuda") + dtype = torch.bfloat16 + + strategy = StandaloneCutlassStrategy() + + # Problem: dW = dY^T @ X where dY:[M,N], X:[M,K] -> dW:[N,K] + M, N, K = 32, 64, 128 + dY = torch.randn(M, N, dtype=dtype, device=device) + X = torch.randn(M, K, dtype=dtype, device=device) + + print(f"Computing: dW = dY^T{dY.shape} @ X{X.shape}") + + # 
Reference PyTorch + dW_ref = torch.mm(dY.t(), X) # [N,M] @ [M,K] = [N,K] + print(f"Reference dW: {dW_ref.shape}, norm: {dW_ref.norm().item():.4f}") + + # CUTLASS approach: dW = dY^T @ X + # Since CUTLASS computes A @ B^T, we use A = dY^T, B^T = X^T + # So CUTLASS computes dY^T @ (X^T)^T = dY^T @ X = dW + print("CUTLASS approach: dY^T @ X using A @ B^T format") + + dY_T = dY.t().contiguous() # [N, M] + X_T = X.t().contiguous() # [K, M] + dW_cutlass = torch.zeros(N, K, dtype=dtype, device=device) # [N, K] + + print(f" dY^T{dY_T.shape} @ (X^T)^T{X_T.shape} = dW{dW_cutlass.shape}") + print(f" Note: CUTLASS computes dY^T @ X^T^T = dY^T @ X") + + # Execute: dY^T @ X^T^T (CUTLASS transposes second operand) + strategy.execute_cutlass_gemm(dY_T, X_T, dW_cutlass, "weight_gradient") + + print(f"CUTLASS dW: {dW_cutlass.shape}, norm: {dW_cutlass.norm().item():.4f}") + + # Compare + diff = torch.abs(dW_cutlass - dW_ref).max().item() + rel_diff = diff / dW_ref.abs().max().item() + + print(f"Max difference: {diff:.2e}") + print(f"Relative difference: {rel_diff:.2e}") + + if rel_diff < 1e-2: + print("✅ Weight gradient works!") + return True + else: + print("❌ Weight gradient failed!") + print(f"First few elements - Ref: {dW_ref.flatten()[:5]}") + print(f"First few elements - CUTLASS: {dW_cutlass.flatten()[:5]}") + return False + + +def test_complete_backward(): + """Test complete backward pass for a single expert""" + print("\n🔍 Testing Complete Backward Pass") + print("=" * 40) + + device = torch.device("cuda") + dtype = torch.bfloat16 + + strategy = StandaloneCutlassStrategy() + + # Problem setup: Y = X @ W^T, given dY, compute dX and dW + M, N, K = 32, 64, 128 + X = torch.randn(M, K, dtype=dtype, device=device) # Input [M, K] + W = torch.randn(N, K, dtype=dtype, device=device) # Weight [N, K] + dY = torch.randn(M, N, dtype=dtype, device=device) # Upstream grad [M, N] + + print(f"Forward was: Y = X{X.shape} @ W^T{W.shape}") + print(f"Given upstream grad dY{dY.shape}") + print(f"Computing dX and dW...") + + # Reference PyTorch backward + dX_ref = torch.mm(dY, W) # [M,N] @ [N,K] = [M,K] + dW_ref = torch.mm(dY.t(), X) # [N,M] @ [M,K] = [N,K] + + print(f"Reference dX: {dX_ref.shape}, norm: {dX_ref.norm().item():.4f}") + print(f"Reference dW: {dW_ref.shape}, norm: {dW_ref.norm().item():.4f}") + + # CUTLASS backward + print("\nCUTLASS computation:") + + # Input gradient: dX = dY @ W + dX_cutlass = torch.zeros(M, K, dtype=dtype, device=device) + + # For input gradient, we need to handle the special case in execute_cutlass_gemm + strategy.execute_cutlass_gemm(dY, W, dX_cutlass, "backward_input") + + # Weight gradient: dW = dY^T @ X + dY_T = dY.t().contiguous() # [N, M] + dW_cutlass = torch.zeros(N, K, dtype=dtype, device=device) + + strategy.execute_cutlass_gemm(dY_T, X, dW_cutlass, "backward_weight") + + print(f"CUTLASS dX: {dX_cutlass.shape}, norm: {dX_cutlass.norm().item():.4f}") + print(f"CUTLASS dW: {dW_cutlass.shape}, norm: {dW_cutlass.norm().item():.4f}") + + # Compare both gradients + dX_diff = torch.abs(dX_cutlass - dX_ref).max().item() + dX_rel_diff = dX_diff / dX_ref.abs().max().item() + + dW_diff = torch.abs(dW_cutlass - dW_ref).max().item() + dW_rel_diff = dW_diff / dW_ref.abs().max().item() + + print(f"\nComparison:") + print(f"dX max diff: {dX_diff:.2e} (relative: {dX_rel_diff:.2e})") + print(f"dW max diff: {dW_diff:.2e} (relative: {dW_rel_diff:.2e})") + + success = dX_rel_diff < 1e-2 and dW_rel_diff < 1e-2 + + if success: + print("✅ Complete backward pass works!") + else: + print("❌ Complete 
backward pass failed!") + if dX_rel_diff >= 1e-2: + print(" Input gradient has large errors") + if dW_rel_diff >= 1e-2: + print(" Weight gradient has large errors") + + return success + + +def test_grouped_backward(): + """Test backward pass with multiple experts (minimal version)""" + print("\n🔍 Testing Grouped Backward (2 experts)") + print("=" * 45) + + device = torch.device("cuda") + dtype = torch.bfloat16 + + strategy = StandaloneCutlassStrategy() + + # Setup: 2 experts, simple token distribution + num_experts = 2 + tokens_per_expert = 16 + total_tokens = num_experts * tokens_per_expert + in_features = 64 + out_features = 128 + + # Create data + X = torch.randn(total_tokens, in_features, dtype=dtype, device=device) + W = torch.randn(num_experts, out_features, in_features, dtype=dtype, device=device) + dY = torch.randn(total_tokens, out_features, dtype=dtype, device=device) + + print(f"Setup: {num_experts} experts, {tokens_per_expert} tokens each") + print(f"X: {X.shape}, W: {W.shape}, dY: {dY.shape}") + + # Reference PyTorch grouped backward + dX_ref = torch.zeros_like(X) + dW_ref = torch.zeros_like(W) + + for expert_idx in range(num_experts): + start_idx = expert_idx * tokens_per_expert + end_idx = start_idx + tokens_per_expert + + expert_X = X[start_idx:end_idx] # [tokens_per_expert, in_features] + expert_W = W[expert_idx] # [out_features, in_features] + expert_dY = dY[start_idx:end_idx] # [tokens_per_expert, out_features] + + # Compute gradients for this expert + expert_dX = torch.mm( + expert_dY, expert_W + ) # [tokens, in] = [tokens, out] @ [out, in] + expert_dW = torch.mm( + expert_dY.t(), expert_X + ) # [out, in] = [out, tokens] @ [tokens, in] + + dX_ref[start_idx:end_idx] = expert_dX + dW_ref[expert_idx] = expert_dW + + print(f"Reference dX norm: {dX_ref.norm().item():.4f}") + print(f"Reference dW norm: {dW_ref.norm().item():.4f}") + + # CUTLASS grouped backward + dX_cutlass = torch.zeros_like(X) + dW_cutlass = torch.zeros_like(W) + + for expert_idx in range(num_experts): + start_idx = expert_idx * tokens_per_expert + end_idx = start_idx + tokens_per_expert + + expert_X = X[start_idx:end_idx] + expert_W = W[expert_idx] + expert_dY = dY[start_idx:end_idx] + + print(f"\nExpert {expert_idx}:") + + # Input gradient: dX^T = W^T @ dY^T + W_T = expert_W.t().contiguous() + dY_T = expert_dY.t().contiguous() + dX_T = torch.zeros(in_features, tokens_per_expert, dtype=dtype, device=device) + + strategy.execute_cutlass_gemm(W_T, dY_T, dX_T, f"expert_{expert_idx}_input") + dX_cutlass[start_idx:end_idx] = dX_T.t() + + # Weight gradient: dW = dY^T @ X + dY_T = expert_dY.t().contiguous() + X_T = expert_X.t().contiguous() + expert_dW_cutlass = torch.zeros( + out_features, in_features, dtype=dtype, device=device + ) + + strategy.execute_cutlass_gemm( + dY_T, X_T, expert_dW_cutlass, f"expert_{expert_idx}_weight" + ) + dW_cutlass[expert_idx] = expert_dW_cutlass + + print(f"\nCUTLASS dX norm: {dX_cutlass.norm().item():.4f}") + print(f"CUTLASS dW norm: {dW_cutlass.norm().item():.4f}") + + # Compare + dX_diff = torch.abs(dX_cutlass - dX_ref).max().item() + dX_rel_diff = dX_diff / dX_ref.abs().max().item() + + dW_diff = torch.abs(dW_cutlass - dW_ref).max().item() + dW_rel_diff = dW_diff / dW_ref.abs().max().item() + + print(f"\nComparison:") + print(f"dX max diff: {dX_diff:.2e} (relative: {dX_rel_diff:.2e})") + print(f"dW max diff: {dW_diff:.2e} (relative: {dW_rel_diff:.2e})") + + success = dX_rel_diff < 1e-2 and dW_rel_diff < 1e-2 + + if success: + print("✅ Grouped backward pass works!") + else: + 
print("❌ Grouped backward pass failed!") + + return success + + +def main(): + """Main test sequence""" + print("🧪 Standalone CUTLASS Backward Test") + print("=" * 50) + + if not torch.cuda.is_available(): + print("❌ CUDA not available") + return + + if not HAS_CUTLASS: + print("❌ CUTLASS not available") + return + + print(f"GPU: {torch.cuda.get_device_name()}") + print(f"CUDA Version: {torch.version.cuda}") + + tests = [ + ("Basic CUTLASS GEMM", test_basic_cutlass_gemm), + ("Input Gradient", test_input_gradient), + ("Weight Gradient", test_weight_gradient), + ("Complete Backward", test_complete_backward), + ("Grouped Backward", test_grouped_backward), + ] + + results = [] + for test_name, test_func in tests: + print(f"\n" + "=" * 60) + try: + success = test_func() + results.append((test_name, "✅ PASS" if success else "❌ FAIL")) + except Exception as e: + print(f"❌ {test_name} crashed: {e}") + import traceback + + traceback.print_exc() + results.append((test_name, "💥 CRASH")) + + # Summary + print(f"\n" + "=" * 60) + print("📊 FINAL RESULTS") + print("=" * 60) + + for test_name, result in results: + print(f"{test_name:<20} {result}") + + # Count successes + passes = sum(1 for _, result in results if "PASS" in result) + total = len(results) + + print(f"\nOverall: {passes}/{total} tests passed") + + if passes == total: + print("🎉 All tests passed! CUTLASS backward is working correctly.") + else: + print("🔧 Some tests failed. Check the specific failures above.") + + +if __name__ == "__main__": + main() diff --git a/torchtitan/experiments/deepseek_v3/simple_debug_back.py b/torchtitan/experiments/deepseek_v3/simple_debug_back.py new file mode 100644 index 000000000..10e95a5b4 --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/simple_debug_back.py @@ -0,0 +1,332 @@ +#!/usr/bin/env python3 +""" +Simple debug script to isolate CUTLASS backward pass issues. +Run this to identify exactly where the numerical problems occur. 
+""" + +import numpy as np +import torch +import torch.nn as nn + + +def test_single_expert_operations(): + """Test operations on a single expert to isolate issues""" + print("🔍 Testing Single Expert Operations") + print("=" * 50) + + device = torch.device("cuda") + dtype = torch.bfloat16 + + # Simple test case + M, N, K = 32, 64, 128 # Small sizes for debugging + + # Create test data + X = torch.randn(M, K, dtype=dtype, device=device, requires_grad=True) # Input + W = torch.randn(N, K, dtype=dtype, device=device, requires_grad=True) # Weight + + print(f"Input X: {X.shape}") + print(f"Weight W: {W.shape}") + + # Forward pass: Y = X @ W^T + Y = torch.mm(X, W.t()) # [M, N] + print(f"Output Y: {Y.shape}") + + # Create upstream gradient + dY = torch.randn_like(Y) + print(f"Upstream grad dY: {dY.shape}") + + # Compute reference gradients + print("\n📊 Reference PyTorch Gradients:") + Y_ref = torch.mm(X, W.t()) + Y_ref.backward(dY, retain_graph=True) + + dX_ref = X.grad.clone() + dW_ref = W.grad.clone() + + print(f"dX_ref norm: {dX_ref.norm().item():.4f}") + print(f"dW_ref norm: {dW_ref.norm().item():.4f}") + + # Clear gradients + X.grad = None + W.grad = None + + # Manual gradient computation + print("\n🧮 Manual Gradient Computation:") + dX_manual = torch.mm(dY, W) # [M, N] @ [N, K] = [M, K] + dW_manual = torch.mm(dY.t(), X) # [N, M] @ [M, K] = [N, K] + + print(f"dX_manual norm: {dX_manual.norm().item():.4f}") + print(f"dW_manual norm: {dW_manual.norm().item():.4f}") + + # Check manual vs reference + dX_diff = torch.abs(dX_manual - dX_ref).max().item() + dW_diff = torch.abs(dW_manual - dW_ref).max().item() + + print(f"\n✅ Manual vs Reference:") + print(f"dX difference: {dX_diff:.2e}") + print(f"dW difference: {dW_diff:.2e}") + + if dX_diff < 1e-3 and dW_diff < 1e-3: + print("✅ Manual gradients match reference!") + else: + print("❌ Manual gradients don't match!") + return False + + return dX_manual, dW_manual, X, W, dY + + +def test_cutlass_simple_operations(): + """Test CUTLASS operations step by step""" + print("\n🔍 Testing CUTLASS Simple Operations") + print("=" * 50) + + try: + from cutlass_backwards_debug import ( + CUTLASSBackwardGroupGemmDebug, + CUTLASSGroupedGemmStrategyDebug, + ) + except ImportError: + print("❌ Cannot import debug modules") + return False + + # Get reference data from single expert test + dX_ref, dW_ref, X, W, dY = test_single_expert_operations() + + device = X.device + dtype = X.dtype + + # Create debug strategy + strategy = CUTLASSGroupedGemmStrategyDebug( + debug_mode=True, + backward_method="approach_3", # Single expert debugging + use_2cta_instrs=False, + mma_tiler_mn=(128, 128), + cluster_shape_mn=(1, 1), + ) + + print(f"\n🔧 Testing CUTLASS Single Expert Operations:") + + # Test single expert CUTLASS operations + try: + dX_cutlass, dW_cutlass = ( + CUTLASSBackwardGroupGemmDebug._test_single_expert_cutlass( + dY, X, W, strategy + ) + ) + + # Compare with reference + dX_cutlass_diff = torch.abs(dX_cutlass - dX_ref).max().item() + dW_cutlass_diff = torch.abs(dW_cutlass - dW_ref).max().item() + + print(f"\n📊 CUTLASS vs Reference:") + print(f"dX difference: {dX_cutlass_diff:.2e}") + print(f"dW difference: {dW_cutlass_diff:.2e}") + + if dX_cutlass_diff < 1e-2 and dW_cutlass_diff < 1e-2: + print("✅ CUTLASS single expert operations working!") + return True + else: + print("❌ CUTLASS single expert operations have issues") + return False + + except Exception as e: + print(f"❌ CUTLASS single expert test failed: {e}") + import traceback + + traceback.print_exc() + return 
False + + +def test_grouped_operations(): + """Test full grouped operations""" + print("\n🔍 Testing Grouped Operations") + print("=" * 50) + + try: + from cutlass_backwards_debug import ( + CUTLASSGroupedGemmStrategyDebug, + CUTLASSGroupedLinearDebug, + ) + except ImportError: + print("❌ Cannot import debug modules") + return False + + device = torch.device("cuda") + dtype = torch.bfloat16 + + # Test parameters + num_experts = 4 + in_features = 256 + out_features = 512 + total_tokens = 128 + + # Test different approaches + approaches = ["approach_1", "approach_2", "approach_3"] + + for approach in approaches: + print(f"\n🔧 Testing grouped operations with {approach}") + + try: + # Create strategy + strategy = CUTLASSGroupedGemmStrategyDebug( + debug_mode=True, + backward_method=approach, + use_2cta_instrs=False, + mma_tiler_mn=(128, 128), + cluster_shape_mn=(1, 1), + ) + + # Create test data + input_tokens = torch.randn( + total_tokens, + in_features, + dtype=dtype, + device=device, + requires_grad=True, + ) + expert_assignments = torch.randint( + 0, num_experts, (total_tokens,), device=device + ) + + # Create layer + layer = CUTLASSGroupedLinearDebug( + num_experts, in_features, out_features, strategy, dtype=dtype + ) + layer = layer.to(device) + + # Forward pass + output = layer(input_tokens, expert_assignments) + + # Backward pass + loss = output.sum() + loss.backward() + + print(f"✅ {approach} completed successfully") + + # Check if gradients exist and are reasonable + if input_tokens.grad is not None: + input_grad_norm = input_tokens.grad.norm().item() + print(f" Input grad norm: {input_grad_norm:.4f}") + else: + print(" ❌ No input gradient!") + + if layer.weight.grad is not None: + weight_grad_norm = layer.weight.grad.norm().item() + print(f" Weight grad norm: {weight_grad_norm:.4f}") + else: + print(" ❌ No weight gradient!") + + except Exception as e: + print(f"❌ {approach} failed: {e}") + import traceback + + traceback.print_exc() + + +def debug_matrix_operations(): + """Debug the core matrix operations used in backward pass""" + print("\n🔍 Debugging Core Matrix Operations") + print("=" * 50) + + device = torch.device("cuda") + dtype = torch.bfloat16 + + # Test data + M, N, K = 16, 32, 64 + + A = torch.randn(M, K, dtype=dtype, device=device) + B = torch.randn(N, K, dtype=dtype, device=device) + dC = torch.randn(M, N, dtype=dtype, device=device) + + print(f"A: {A.shape}, B: {B.shape}, dC: {dC.shape}") + + # Test different formulations of the same operations + print("\n📊 Testing Input Gradient Formulations:") + + # Method 1: Direct computation dA = dC @ B + dA_direct = torch.mm(dC, B) # [M, N] @ [N, K] = [M, K] + print(f"Method 1 (direct): {dA_direct.shape}, norm: {dA_direct.norm().item():.4f}") + + # Method 2: Transpose formulation dA^T = B^T @ dC^T + dA_transpose = torch.mm(B.t(), dC.t()).t() # [K, N] @ [N, M] = [K, M] -> [M, K] + print( + f"Method 2 (transpose): {dA_transpose.shape}, norm: {dA_transpose.norm().item():.4f}" + ) + + # Check if they're the same + dA_diff = torch.abs(dA_direct - dA_transpose).max().item() + print(f"Difference: {dA_diff:.2e}") + + print("\n📊 Testing Weight Gradient Formulations:") + + # Method 1: Direct computation dB = dC^T @ A + dB_direct = torch.mm(dC.t(), A) # [N, M] @ [M, K] = [N, K] + print(f"Method 1 (direct): {dB_direct.shape}, norm: {dB_direct.norm().item():.4f}") + + # Method 2: Using transpose dB^T = A^T @ dC, then transpose + dB_transpose = torch.mm(A.t(), dC).t() # [K, M] @ [M, N] = [K, N] -> [N, K] + print( + f"Method 2 (transpose): 
{dB_transpose.shape}, norm: {dB_transpose.norm().item():.4f}" + ) + + # Check if they're the same + dB_diff = torch.abs(dB_direct - dB_transpose).max().item() + print(f"Difference: {dB_diff:.2e}") + + if dA_diff < 1e-5 and dB_diff < 1e-5: + print("✅ All formulations are mathematically equivalent!") + return True + else: + print("❌ Formulations don't match - there's a mathematical error!") + return False + + +def main(): + """Main debug sequence""" + print("🧪 CUTLASS Backward Pass Debug Suite") + print("=" * 60) + + # Step 4: Test grouped operations + print("\n" + "=" * 60) + test_grouped_operations() + + if not torch.cuda.is_available(): + print("❌ CUDA not available") + return + + print(f"GPU: {torch.cuda.get_device_name()}") + print(f"CUDA Version: {torch.version.cuda}") + + # Step 1: Test mathematical formulations + print("\n" + "=" * 60) + if not debug_matrix_operations(): + print("❌ Mathematical formulations failed - fix before continuing") + return + + # Step 2: Test single expert operations + print("\n" + "=" * 60) + single_expert_result = test_single_expert_operations() + if single_expert_result is False: + print("❌ Single expert operations failed") + return + + # Step 3: Test CUTLASS single expert operations + print("\n" + "=" * 60) + if not test_cutlass_simple_operations(): + print("❌ CUTLASS single expert operations failed") + return + + # Step 4: Test grouped operations + print("\n" + "=" * 60) + test_grouped_operations() + + print("\n" + "=" * 60) + print("🎯 Debug sequence completed!") + print("\nNext steps based on results:") + print("1. If single expert works but grouped fails -> issue in batching/metadata") + print("2. If CUTLASS single expert fails -> issue in CUTLASS setup") + print("3. If mathematical formulations fail -> fundamental math error") + + +if __name__ == "__main__": + main() From d6f5a0398c01cb79b97eff0f08a8a782ea7b233f Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sat, 14 Jun 2025 11:33:30 -0700 Subject: [PATCH 17/34] backwards working(!) --- .../deepseek_v3/dsl_back_standalone.py | 250 ++++++++---------- 1 file changed, 108 insertions(+), 142 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py b/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py index 20e34afbc..639af9169 100644 --- a/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py +++ b/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py @@ -1,70 +1,7 @@ #!/usr/bin/env python3 """ -Standalone CUTLASS backward pass test. -Self-contained with no external dependencies beyond basic CUTLASS. - - -current: - -CUTLASS computation: - Executing backward_input: Atorch.Size([32, 64]) @ B^Ttorch.Size([64, 128]) = Ctorch.Size([32, 128]) - Problem: [32, 128, 64, 1] - Strides: [[64, 1], [128, 1], [128, 1]] -max_dynamic_shared_memory: 232448 -max_active_blocks: 1 - Compiling kernel for backward_input... 
- ✅ Kernel compiled - ✅ backward_input executed -❌ Complete Backward crashed: Inner dimension mismatch: 32 != 128 -Traceback (most recent call last): - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py", line 576, in main - success = test_func() - ^^^^^^^^^^^ - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py", line 409, in test_complete_backward - strategy.execute_cutlass_gemm(dY_T, X, dW_cutlass, "backward_weight") - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py", line 123, in execute_cutlass_gemm - assert K == K_B, f"Inner dimension mismatch: {K} != {K_B}" - ^^^^^^^^ -AssertionError: Inner dimension mismatch: 32 != 128 - -============================================================ - -🔍 Testing Grouped Backward (2 experts) -============================================= -🔧 Initializing standalone CUTLASS strategy... -cute hardware - device_id 0 -cute hardware - driver_version 12080 -max_dynamic_shared_memory: 232448 -max_active_blocks: 1 -✅ Strategy initialized (max_active_clusters: 148) -Setup: 2 experts, 16 tokens each -X: torch.Size([32, 64]), W: torch.Size([2, 128, 64]), dY: torch.Size([32, 128]) -Reference dX norm: 520.0000 -Reference dW norm: 520.0000 - -Expert 0: -❌ Grouped Backward crashed: Inner dimension mismatch: 128 != 16 -Traceback (most recent call last): - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py", line 576, in main - success = test_func() - ^^^^^^^^^^^ - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py", line 509, in test_grouped_backward - strategy.execute_cutlass_gemm(W_T, dY_T, dX_T, f"expert_{expert_idx}_input") - File "/data/users/less/torchtitan/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py", line 123, in execute_cutlass_gemm - assert K == K_B, f"Inner dimension mismatch: {K} != {K_B}" - ^^^^^^^^ -AssertionError: Inner dimension mismatch: 128 != 16 - -============================================================ -📊 FINAL RESULTS -============================================================ -Basic CUTLASS GEMM ✅ PASS -Input Gradient ✅ PASS -Weight Gradient ✅ PASS -Complete Backward 💥 CRASH -Grouped Backward 💥 CRASH - -Overall: 3/5 tests passed +Fixed standalone CUTLASS backward pass test. +Corrects the dimensional issues in the matrix operations. 
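The key point of the fix: the CUTLASS kernel computes C = A @ B^T, so both
backward GEMMs are expressed directly in that form instead of transposing the
output afterwards. With dY: [M, N], W: [N, K], X: [M, K] (the shapes used in
the tests below), the mapping is roughly:

.. code-block:: python

    # input gradient   dX = dY @ W       ->  A = dY,      B = W.t()   (so B^T = W)
    # weight gradient  dW = dY.t() @ X   ->  A = dY.t(),  B = X.t()   (so B^T = X)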
""" import torch @@ -88,11 +25,11 @@ exit(1) -class StandaloneCutlassStrategy: - """Self-contained CUTLASS strategy for testing""" +class FixedCutlassStrategy: + """Fixed CUTLASS strategy with correct dimension handling""" def __init__(self): - print("🔧 Initializing standalone CUTLASS strategy...") + print("🔧 Initializing fixed CUTLASS strategy...") # Force CUDA context creation dummy = torch.zeros(1, device="cuda") @@ -169,31 +106,22 @@ def _create_initial_tensors(self, problem_shape, device): return cute_tensors - def execute_cutlass_gemm(self, A, B, C, operation_name="gemm"): - """Execute a single CUTLASS GEMM: C = A @ B^T""" + def execute_cutlass_gemm_basic(self, A, B, C, operation_name="gemm"): + """Execute basic CUTLASS GEMM: C = A @ B^T""" M, K = A.shape N, K_B = B.shape - # For input gradient computation: dX = dY @ W - # dY is [M, N] and W is [N, K], so we need to handle this special case - if operation_name == "backward_input": - # For backward_input, we expect A=dY [M,N] and B=W [N,K] - # The inner dimensions should match (N == N) - assert K == N, f"Inner dimension mismatch for backward_input: {K} != {N}" - # Swap K_B and N for the assertion below - K_B, N = N, K_B - assert K == K_B, f"Inner dimension mismatch: {K} != {K_B}" assert C.shape == ( M, N, ), f"Output shape mismatch: expected ({M}, {N}), got {C.shape}" + print(f" Executing {operation_name}: A{A.shape} @ B^T{B.shape} = C{C.shape}") + L = 1 device = A.device - print(f" Executing {operation_name}: A{A.shape} @ B^T{B.shape} = C{C.shape}") - # Convert to MNKL format A_mnkl = A.unsqueeze(-1).contiguous() B_mnkl = B.unsqueeze(-1).contiguous() @@ -220,6 +148,84 @@ def execute_cutlass_gemm(self, A, B, C, operation_name="gemm"): return C + def compute_input_gradient(self, grad_output, weight, operation_name="input_grad"): + """ + Compute input gradient: dX = dY @ W + + Args: + grad_output: [M, N] - upstream gradient + weight: [N, K] - weight matrix + + Returns: + grad_input: [M, K] - input gradient + """ + M, N = grad_output.shape + N_w, K = weight.shape + + assert N == N_w, f"Dimension mismatch: grad_output has {N}, weight has {N_w}" + + print( + f" Computing input gradient: dY{grad_output.shape} @ W{weight.shape} = dX[{M}, {K}]" + ) + + # Since CUTLASS computes A @ B^T, and we want dY @ W: + # We can compute this directly as dY @ W where CUTLASS treats W as B^T + # So A = dY [M, N], B = W^T [K, N] (so B^T = W [N, K]) + weight_for_cutlass = ( + weight.t().contiguous() + ) # [K, N] - this will be transposed to [N, K] + grad_input = torch.zeros( + M, K, dtype=self.DTYPE_TORCH, device=grad_output.device + ) + + print( + f" CUTLASS setup: dY{grad_output.shape} @ (W^T)^T{weight_for_cutlass.shape} = dX{grad_input.shape}" + ) + + return self.execute_cutlass_gemm_basic( + grad_output, weight_for_cutlass, grad_input, operation_name + ) + + def compute_weight_gradient( + self, grad_output, input_tokens, operation_name="weight_grad" + ): + """ + Compute weight gradient: dW = dY^T @ X + + Args: + grad_output: [M, N] - upstream gradient + input_tokens: [M, K] - input tokens + + Returns: + grad_weight: [N, K] - weight gradient + """ + M, N = grad_output.shape + M_i, K = input_tokens.shape + + assert M == M_i, f"Dimension mismatch: grad_output has {M}, input has {M_i}" + + print( + f" Computing weight gradient: dY^T{grad_output.shape} @ X{input_tokens.shape} = dW[{N}, {K}]" + ) + + # Since CUTLASS computes A @ B^T, and we want dY^T @ X: + # A = dY^T [N, M], B = X^T [K, M] (so B^T = X [M, K]) + grad_output_T = grad_output.t().contiguous() # 
[N, M] + input_for_cutlass = ( + input_tokens.t().contiguous() + ) # [K, M] - this will be transposed to [M, K] + grad_weight = torch.zeros( + N, K, dtype=self.DTYPE_TORCH, device=grad_output.device + ) + + print( + f" CUTLASS setup: dY^T{grad_output_T.shape} @ (X^T)^T{input_for_cutlass.shape} = dW{grad_weight.shape}" + ) + + return self.execute_cutlass_gemm_basic( + grad_output_T, input_for_cutlass, grad_weight, operation_name + ) + def _execute_kernel( self, problem_sizes, strides_abc, ptrs_abc, device, operation_name ): @@ -286,7 +292,7 @@ def test_basic_cutlass_gemm(): device = torch.device("cuda") dtype = torch.bfloat16 - strategy = StandaloneCutlassStrategy() + strategy = FixedCutlassStrategy() # Test matrices M, N, K = 64, 128, 256 @@ -301,7 +307,7 @@ def test_basic_cutlass_gemm(): print(f"Reference norm: {C_ref.norm().item():.4f}") # CUTLASS result - strategy.execute_cutlass_gemm(A, B, C, "basic_test") + strategy.execute_cutlass_gemm_basic(A, B, C, "basic_test") print(f"CUTLASS norm: {C.norm().item():.4f}") # Compare @@ -327,7 +333,7 @@ def test_input_gradient(): device = torch.device("cuda") dtype = torch.bfloat16 - strategy = StandaloneCutlassStrategy() + strategy = FixedCutlassStrategy() # Problem: dX = dY @ W where dY:[M,N], W:[N,K] -> dX:[M,K] M, N, K = 32, 64, 128 @@ -340,21 +346,8 @@ def test_input_gradient(): dX_ref = torch.mm(dY, W) # [M,N] @ [N,K] = [M,K] print(f"Reference dX: {dX_ref.shape}, norm: {dX_ref.norm().item():.4f}") - # CUTLASS approach: reformulate as dX^T = W^T @ dY^T - print("CUTLASS approach: dX^T = W^T @ dY^T") - - W_T = W.t().contiguous() # [K, N] - dY_T = dY.t().contiguous() # [N, M] - dX_T = torch.zeros(K, M, dtype=dtype, device=device) # [K, M] - - print(f" W^T{W_T.shape} @ (dY^T)^T{dY_T.shape} = dX^T{dX_T.shape}") - print(f" Note: CUTLASS computes W^T @ dY^T^T = W^T @ dY") - - # Execute: W^T @ dY^T^T (CUTLASS transposes second operand) - strategy.execute_cutlass_gemm(W_T, dY, dX_T, "input_gradient") - - # Transpose back to get dX - dX_cutlass = dX_T.t() # [M, K] + # CUTLASS computation + dX_cutlass = strategy.compute_input_gradient(dY, W, "input_gradient") print(f"CUTLASS dX: {dX_cutlass.shape}, norm: {dX_cutlass.norm().item():.4f}") # Compare @@ -382,7 +375,7 @@ def test_weight_gradient(): device = torch.device("cuda") dtype = torch.bfloat16 - strategy = StandaloneCutlassStrategy() + strategy = FixedCutlassStrategy() # Problem: dW = dY^T @ X where dY:[M,N], X:[M,K] -> dW:[N,K] M, N, K = 32, 64, 128 @@ -395,21 +388,8 @@ def test_weight_gradient(): dW_ref = torch.mm(dY.t(), X) # [N,M] @ [M,K] = [N,K] print(f"Reference dW: {dW_ref.shape}, norm: {dW_ref.norm().item():.4f}") - # CUTLASS approach: dW = dY^T @ X - # Since CUTLASS computes A @ B^T, we use A = dY^T, B^T = X^T - # So CUTLASS computes dY^T @ (X^T)^T = dY^T @ X = dW - print("CUTLASS approach: dY^T @ X using A @ B^T format") - - dY_T = dY.t().contiguous() # [N, M] - X_T = X.t().contiguous() # [K, M] - dW_cutlass = torch.zeros(N, K, dtype=dtype, device=device) # [N, K] - - print(f" dY^T{dY_T.shape} @ (X^T)^T{X_T.shape} = dW{dW_cutlass.shape}") - print(f" Note: CUTLASS computes dY^T @ X^T^T = dY^T @ X") - - # Execute: dY^T @ X^T^T (CUTLASS transposes second operand) - strategy.execute_cutlass_gemm(dY_T, X_T, dW_cutlass, "weight_gradient") - + # CUTLASS computation + dW_cutlass = strategy.compute_weight_gradient(dY, X, "weight_gradient") print(f"CUTLASS dW: {dW_cutlass.shape}, norm: {dW_cutlass.norm().item():.4f}") # Compare @@ -437,7 +417,7 @@ def test_complete_backward(): device = 
torch.device("cuda") dtype = torch.bfloat16 - strategy = StandaloneCutlassStrategy() + strategy = FixedCutlassStrategy() # Problem setup: Y = X @ W^T, given dY, compute dX and dW M, N, K = 32, 64, 128 @@ -460,16 +440,10 @@ def test_complete_backward(): print("\nCUTLASS computation:") # Input gradient: dX = dY @ W - dX_cutlass = torch.zeros(M, K, dtype=dtype, device=device) - - # For input gradient, we need to handle the special case in execute_cutlass_gemm - strategy.execute_cutlass_gemm(dY, W, dX_cutlass, "backward_input") + dX_cutlass = strategy.compute_input_gradient(dY, W, "backward_input") # Weight gradient: dW = dY^T @ X - dY_T = dY.t().contiguous() # [N, M] - dW_cutlass = torch.zeros(N, K, dtype=dtype, device=device) - - strategy.execute_cutlass_gemm(dY_T, X, dW_cutlass, "backward_weight") + dW_cutlass = strategy.compute_weight_gradient(dY, X, "backward_weight") print(f"CUTLASS dX: {dX_cutlass.shape}, norm: {dX_cutlass.norm().item():.4f}") print(f"CUTLASS dW: {dW_cutlass.shape}, norm: {dW_cutlass.norm().item():.4f}") @@ -507,7 +481,7 @@ def test_grouped_backward(): device = torch.device("cuda") dtype = torch.bfloat16 - strategy = StandaloneCutlassStrategy() + strategy = FixedCutlassStrategy() # Setup: 2 experts, simple token distribution num_experts = 2 @@ -564,23 +538,15 @@ def test_grouped_backward(): print(f"\nExpert {expert_idx}:") - # Input gradient: dX^T = W^T @ dY^T - W_T = expert_W.t().contiguous() - dY_T = expert_dY.t().contiguous() - dX_T = torch.zeros(in_features, tokens_per_expert, dtype=dtype, device=device) - - strategy.execute_cutlass_gemm(W_T, dY_T, dX_T, f"expert_{expert_idx}_input") - dX_cutlass[start_idx:end_idx] = dX_T.t() - - # Weight gradient: dW = dY^T @ X - dY_T = expert_dY.t().contiguous() - X_T = expert_X.t().contiguous() - expert_dW_cutlass = torch.zeros( - out_features, in_features, dtype=dtype, device=device + # Input gradient: dX = dY @ W + expert_dX_cutlass = strategy.compute_input_gradient( + expert_dY, expert_W, f"expert_{expert_idx}_input" ) + dX_cutlass[start_idx:end_idx] = expert_dX_cutlass - strategy.execute_cutlass_gemm( - dY_T, X_T, expert_dW_cutlass, f"expert_{expert_idx}_weight" + # Weight gradient: dW = dY^T @ X + expert_dW_cutlass = strategy.compute_weight_gradient( + expert_dY, expert_X, f"expert_{expert_idx}_weight" ) dW_cutlass[expert_idx] = expert_dW_cutlass @@ -610,8 +576,8 @@ def test_grouped_backward(): def main(): """Main test sequence""" - print("🧪 Standalone CUTLASS Backward Test") - print("=" * 50) + print("🧪 Fixed Standalone CUTLASS Backward Test") + print("=" * 55) if not torch.cuda.is_available(): print("❌ CUDA not available") From 59cf49aa6c9ea820327bc933432e23035d303265 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sat, 14 Jun 2025 13:31:57 -0700 Subject: [PATCH 18/34] new benchmarks...gg back not fully working again --- .../deepseek_v3/bench_gg_blackwell.py | 622 +++++++++++++++ .../deepseek_v3/blackwell_group_gemm.py | 736 ++++++++++++++++++ .../deepseek_v3/cute_group_gemm.py | 733 +++++++++++++++++ .../deepseek_v3/simple_debug_back.py | 332 -------- ...k_standalone.py => simple_test_back_gg.py} | 0 5 files changed, 2091 insertions(+), 332 deletions(-) create mode 100644 torchtitan/experiments/deepseek_v3/bench_gg_blackwell.py create mode 100644 torchtitan/experiments/deepseek_v3/blackwell_group_gemm.py create mode 100644 torchtitan/experiments/deepseek_v3/cute_group_gemm.py delete mode 100644 torchtitan/experiments/deepseek_v3/simple_debug_back.py rename torchtitan/experiments/deepseek_v3/{dsl_back_standalone.py => 
simple_test_back_gg.py} (100%) diff --git a/torchtitan/experiments/deepseek_v3/bench_gg_blackwell.py b/torchtitan/experiments/deepseek_v3/bench_gg_blackwell.py new file mode 100644 index 000000000..0100fa4ab --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/bench_gg_blackwell.py @@ -0,0 +1,622 @@ +#!/usr/bin/env python3 +""" +Comprehensive benchmark: CUTLASS Group GEMM vs PyTorch Manual Looping +Tests realistic MoE workloads with ~2048 feature dimensions. +""" + +import time +from typing import List, Tuple + +import numpy as np +import torch +import torch.nn as nn + +# torch.cuda.use_t + +# Import triton for benchmarking +try: + from triton.testing import do_bench + + HAS_TRITON = True + print("✅ Triton available for benchmarking") +except ImportError: + HAS_TRITON = False + print("❌ Triton not available, using basic timing") + + +try: + from cute_group_gemm import ( + create_cutlass_strategy, + # CUTLASSGroupedLinear, + # CUTLASSGroupGemmStrategy, + StrideOptimizedCUTLASSStrategy as CUTLASSGroupGemmStrategy, + StrideOptimizedGroupedLinear as CUTLASSGroupedLinear, + ) + + HAS_CUTLASS = True + print("✅ CUTLASS Group GEMM available") +except ImportError: + HAS_CUTLASS = False + print("❌ CUTLASS Group GEMM not available") + raise ImportError("CUTLASS modules not found. Please update the import paths.") + + +class PyTorchManualGroupedLinear(nn.Module): + """Reference PyTorch implementation using manual loops for comparison""" + + def __init__( + self, + num_experts: int, + in_features: int, + out_features: int, + dtype: torch.dtype = torch.bfloat16, + ): + super().__init__() + self.num_experts = num_experts + self.in_features = in_features + self.out_features = out_features + self.dtype = dtype + + # Same weight initialization as CUTLASS version + self.weight = nn.Parameter( + torch.empty(num_experts, out_features, in_features, dtype=dtype) + ) + self.reset_parameters() + + def reset_parameters(self): + """Initialize parameters to match CUTLASS version""" + for expert_idx in range(self.num_experts): + nn.init.kaiming_uniform_(self.weight[expert_idx], a=1.41421356) + + def forward( + self, input_tokens: torch.Tensor, expert_assignments: torch.Tensor + ) -> torch.Tensor: + """Manual PyTorch forward pass with explicit loops""" + device = input_tokens.device + total_tokens, in_features = input_tokens.shape + out_features = self.out_features + + # Compute expert sizes and offsets + m_sizes, m_offsets = self._compute_expert_sizes_and_offsets(expert_assignments) + + # Sort tokens by expert + sorted_indices = torch.argsort(expert_assignments) + sorted_tokens = input_tokens[sorted_indices] + + # Initialize output + sorted_output = torch.zeros( + total_tokens, out_features, dtype=self.dtype, device=device + ) + + # Manual loop over experts + valid_sizes_cpu = m_sizes.cpu().tolist() + valid_offsets_cpu = m_offsets.cpu().tolist() + + for expert_idx in range(self.num_experts): + size = valid_sizes_cpu[expert_idx] + offset = valid_offsets_cpu[expert_idx] + + if size > 0: + # Get expert data + expert_tokens = sorted_tokens[ + offset : offset + size + ] # [size, in_features] + expert_weight = self.weight[expert_idx] # [out_features, in_features] + + # Forward: Y = X @ W^T + expert_output = torch.mm( + expert_tokens, expert_weight.t() + ) # [size, out_features] + sorted_output[offset : offset + size] = expert_output + + # Restore original order + output = torch.empty_like(sorted_output) + output[sorted_indices] = sorted_output + + return output + + def _compute_expert_sizes_and_offsets( + self, 
expert_assignments: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute expert sizes and offsets""" + device = expert_assignments.device + m_sizes = torch.zeros(self.num_experts, dtype=torch.int32, device=device) + + for expert_idx in range(self.num_experts): + m_sizes[expert_idx] = (expert_assignments == expert_idx).sum() + + m_offsets = torch.cat( + [torch.tensor([0], device=device), torch.cumsum(m_sizes, dim=0)] + ) + + return m_sizes, m_offsets + + +class GroupGemmBenchmark: + """Comprehensive benchmark suite for Group GEMM implementations""" + + def __init__(self, device="cuda", dtype=torch.bfloat16): + self.device = torch.device(device) + self.dtype = dtype + self.results = [] + + def create_test_data( + self, num_experts: int, total_tokens: int, in_features: int, out_features: int + ): + """Create test data for benchmarking""" + # Create input tokens + input_tokens = torch.randn( + total_tokens, + in_features, + dtype=self.dtype, + device=self.device, + requires_grad=True, + ) + + # Create expert assignments (uniform distribution) + expert_assignments = torch.randint( + 0, num_experts, (total_tokens,), device=self.device + ) + + return input_tokens, expert_assignments + + def benchmark_forward_pass(self, config: dict, warmup: int = 5, rep: int = 20): + """Benchmark forward pass performance""" + print(f"\n🔍 Benchmarking Forward Pass: {config['name']}") + print( + f" {config['num_experts']} experts, {config['total_tokens']} tokens, {config['in_features']}→{config['out_features']}" + ) + + # Create test data + input_tokens, expert_assignments = self.create_test_data( + config["num_experts"], + config["total_tokens"], + config["in_features"], + config["out_features"], + ) + + # Create PyTorch manual implementation + pytorch_layer = PyTorchManualGroupedLinear( + config["num_experts"], + config["in_features"], + config["out_features"], + self.dtype, + ).to(self.device) + + # Create CUTLASS implementation (if available) + cutlass_layer = None + if HAS_CUTLASS: + strategy = create_cutlass_strategy( + use_2cta_instrs=False, + mma_tiler_mn=(128, 128), + cluster_shape_mn=(1, 1), + ) + cutlass_layer = CUTLASSGroupedLinear( + config["num_experts"], + config["in_features"], + config["out_features"], + strategy, + dtype=self.dtype, + ).to(self.device) + + # Copy weights to ensure fair comparison + cutlass_layer.weight.data.copy_(pytorch_layer.weight.data) + + # Define benchmark functions + def pytorch_forward(): + return pytorch_layer(input_tokens, expert_assignments) + + def cutlass_forward(): + if cutlass_layer is not None: + return cutlass_layer(input_tokens, expert_assignments) + else: + return pytorch_forward() # Fallback + + # Benchmark using triton if available + if HAS_TRITON: + pytorch_time = do_bench(pytorch_forward, warmup=warmup, rep=rep) + cutlass_time = ( + do_bench(cutlass_forward, warmup=warmup, rep=rep) + if HAS_CUTLASS + else float("inf") + ) + else: + pytorch_time = self._basic_benchmark(pytorch_forward, warmup, rep) + cutlass_time = ( + self._basic_benchmark(cutlass_forward, warmup, rep) + if HAS_CUTLASS + else float("inf") + ) + + # Verify numerical correctness + if HAS_CUTLASS: + with torch.no_grad(): + pytorch_out = pytorch_forward() + cutlass_out = cutlass_forward() + max_diff = torch.abs(pytorch_out - cutlass_out).max().item() + rel_diff = max_diff / pytorch_out.abs().max().item() + correctness = rel_diff < 1e-3 + else: + correctness = False + max_diff = float("inf") + + speedup = pytorch_time / cutlass_time if cutlass_time < float("inf") else 0 + + result 
= { + "config": config["name"], + "operation": "forward", + "pytorch_time": pytorch_time, + "cutlass_time": cutlass_time, + "speedup": speedup, + "correctness": correctness, + "max_diff": max_diff, + } + + print(f" PyTorch: {pytorch_time:.2f} ms") + print(f" CUTLASS: {cutlass_time:.2f} ms") + print(f" Speedup: {speedup:.2f}x") + print( + f" Correct: {'✅' if correctness else '❌'} (max diff: {max_diff:.2e})" + ) + + return result + + def benchmark_backward_pass(self, config: dict, warmup: int = 5, rep: int = 20): + """Benchmark backward pass performance""" + print(f"\n🔍 Benchmarking Backward Pass: {config['name']}") + print( + f" {config['num_experts']} experts, {config['total_tokens']} tokens, {config['in_features']}→{config['out_features']}" + ) + + # Create test data + input_tokens, expert_assignments = self.create_test_data( + config["num_experts"], + config["total_tokens"], + config["in_features"], + config["out_features"], + ) + + # Create PyTorch manual implementation + pytorch_layer = PyTorchManualGroupedLinear( + config["num_experts"], + config["in_features"], + config["out_features"], + self.dtype, + ).to(self.device) + + # Create CUTLASS implementation (if available) + cutlass_layer = None + if HAS_CUTLASS: + strategy = create_cutlass_strategy( + use_2cta_instrs=False, + mma_tiler_mn=(128, 128), + cluster_shape_mn=(1, 1), + ) + cutlass_layer = CUTLASSGroupedLinear( + config["num_experts"], + config["in_features"], + config["out_features"], + strategy, + dtype=self.dtype, + ).to(self.device) + + # Copy weights to ensure fair comparison + cutlass_layer.weight.data.copy_(pytorch_layer.weight.data) + + # Define benchmark functions + def pytorch_backward(): + input_clone = input_tokens.clone().detach().requires_grad_(True) + pytorch_layer.zero_grad() + output = pytorch_layer(input_clone, expert_assignments) + loss = output.sum() + loss.backward() + return loss + + def cutlass_backward(): + if cutlass_layer is not None: + input_clone = input_tokens.clone().detach().requires_grad_(True) + cutlass_layer.zero_grad() + output = cutlass_layer(input_clone, expert_assignments) + loss = output.sum() + loss.backward() + return loss + else: + return pytorch_backward() # Fallback + + # Benchmark using triton if available + if HAS_TRITON: + pytorch_time = do_bench(pytorch_backward, warmup=warmup, rep=rep) + cutlass_time = ( + do_bench(cutlass_backward, warmup=warmup, rep=rep) + if HAS_CUTLASS + else float("inf") + ) + else: + pytorch_time = self._basic_benchmark(pytorch_backward, warmup, rep) + cutlass_time = ( + self._basic_benchmark(cutlass_backward, warmup, rep) + if HAS_CUTLASS + else float("inf") + ) + + # Verify gradient correctness + if HAS_CUTLASS: + # Test gradient correctness + input_pytorch = input_tokens.clone().detach().requires_grad_(True) + input_cutlass = input_tokens.clone().detach().requires_grad_(True) + + pytorch_layer.zero_grad() + cutlass_layer.zero_grad() + + pytorch_out = pytorch_layer(input_pytorch, expert_assignments) + cutlass_out = cutlass_layer(input_cutlass, expert_assignments) + + pytorch_out.sum().backward() + cutlass_out.sum().backward() + + input_grad_diff = ( + torch.abs(input_pytorch.grad - input_cutlass.grad).max().item() + ) + weight_grad_diff = ( + torch.abs(pytorch_layer.weight.grad - cutlass_layer.weight.grad) + .max() + .item() + ) + + input_rel_diff = input_grad_diff / input_pytorch.grad.abs().max().item() + weight_rel_diff = ( + weight_grad_diff / pytorch_layer.weight.grad.abs().max().item() + ) + + correctness = input_rel_diff < 1e-2 and weight_rel_diff 
< 1e-2 + max_diff = max(input_grad_diff, weight_grad_diff) + else: + correctness = False + max_diff = float("inf") + + speedup = pytorch_time / cutlass_time if cutlass_time < float("inf") else 0 + + result = { + "config": config["name"], + "operation": "backward", + "pytorch_time": pytorch_time, + "cutlass_time": cutlass_time, + "speedup": speedup, + "correctness": correctness, + "max_diff": max_diff, + } + + print(f" PyTorch: {pytorch_time:.2f} ms") + print(f" CUTLASS: {cutlass_time:.2f} ms") + print(f" Speedup: {speedup:.2f}x") + print( + f" Correct: {'✅' if correctness else '❌'} (max diff: {max_diff:.2e})" + ) + + return result + + def _basic_benchmark(self, func, warmup: int, rep: int): + """Basic timing fallback when triton is not available""" + # Warmup + for _ in range(warmup): + func() + torch.cuda.synchronize() + + # Timing + start_time = time.time() + for _ in range(rep): + func() + torch.cuda.synchronize() + end_time = time.time() + + return (end_time - start_time) / rep * 1000 # Convert to ms + + def run_comprehensive_benchmark(self): + """Run comprehensive benchmarks across different problem sizes""" + print("🚀 Comprehensive Group GEMM Benchmark") + print("=" * 70) + print(f"Device: {self.device}") + print(f"Data type: {self.dtype}") + print(f"Triton benchmarking: {'✅' if HAS_TRITON else '❌'}") + print(f"CUTLASS available: {'✅' if HAS_CUTLASS else '❌'}") + + # Test configurations focused on ~2048 feature dimensions + configs = [ + # Small MoE setups + { + "name": "Small-4E", + "num_experts": 4, + "total_tokens": 512, + "in_features": 2048, + "out_features": 2048, + }, + { + "name": "Small-8E", + "num_experts": 8, + "total_tokens": 1024, + "in_features": 2048, + "out_features": 2048, + }, + # Medium MoE setups (typical 7B model dimensions) + { + "name": "MoE-7B-Gate", + "num_experts": 8, + "total_tokens": 2048, + "in_features": 4096, + "out_features": 11008, # Typical MoE up_proj dimension + }, + { + "name": "MoE-7B-Down", + "num_experts": 8, + "total_tokens": 2048, + "in_features": 11008, + "out_features": 4096, # Typical MoE down_proj dimension + }, + # Large MoE setups + { + "name": "Large-16E", + "num_experts": 16, + "total_tokens": 4096, + "in_features": 4096, + "out_features": 11008, + }, + { + "name": "XLarge-32E", + "num_experts": 32, + "total_tokens": 4096, + "in_features": 4096, + "out_features": 11008, + }, + # Very large (DeepSeek-V3 scale) + { + "name": "DeepSeek-64E", + "num_experts": 64, + "total_tokens": 8192, + "in_features": 7168, # DeepSeek-V3 dimensions + "out_features": 18944, + }, + ] + + all_results = [] + + for config in configs: + print(f"\n" + "=" * 70) + print(f"📊 Configuration: {config['name']}") + print( + f" Experts: {config['num_experts']}, Tokens: {config['total_tokens']}" + ) + print(f" Dimensions: {config['in_features']} → {config['out_features']}") + print( + f" Problem size: ~{config['total_tokens'] * config['in_features'] * config['out_features'] / 1e6:.1f}M operations" + ) + + try: + # Benchmark forward pass + forward_result = self.benchmark_forward_pass(config) + all_results.append(forward_result) + + # Benchmark backward pass + backward_result = self.benchmark_backward_pass(config) + all_results.append(backward_result) + + except Exception as e: + print(f"❌ Error benchmarking {config['name']}: {e}") + continue + + # Print summary + self.print_benchmark_summary(all_results) + + return all_results + + def print_benchmark_summary(self, results: List[dict]): + """Print formatted summary of benchmark results""" + print(f"\n" + "=" * 90) + 
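        # The reported times can be converted to achieved throughput if needed.
        # For a config with dimensions total_tokens / in_features / out_features
        # and a measured time_ms (illustrative names, not fields of `results`):
        #   tflops = 2 * total_tokens * in_features * out_features / (time_ms * 1e-3) / 1e12
        # using the usual 2 FLOPs per multiply-accumulate.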
print("📈 BENCHMARK SUMMARY") + print("=" * 90) + + # Group results by operation type + forward_results = [r for r in results if r["operation"] == "forward"] + backward_results = [r for r in results if r["operation"] == "backward"] + + def print_operation_summary(op_results: List[dict], operation_name: str): + print(f"\n🔍 {operation_name.upper()} PASS RESULTS:") + print("-" * 90) + + header = f"{'Config':<15} {'PyTorch (ms)':<12} {'CUTLASS (ms)':<12} {'Speedup':<8} {'Correct':<8} {'Max Diff':<10}" + print(header) + print("-" * len(header)) + + speedups = [] + for result in op_results: + config = result["config"] + pytorch_time = result["pytorch_time"] + cutlass_time = result["cutlass_time"] + speedup = result["speedup"] + correctness = "✅" if result["correctness"] else "❌" + max_diff = result["max_diff"] + + cutlass_str = ( + f"{cutlass_time:.2f}" if cutlass_time < float("inf") else "N/A" + ) + speedup_str = f"{speedup:.2f}x" if speedup > 0 else "N/A" + max_diff_str = f"{max_diff:.1e}" if max_diff < float("inf") else "N/A" + + print( + f"{config:<15} {pytorch_time:<12.2f} {cutlass_str:<12} {speedup_str:<8} {correctness:<8} {max_diff_str:<10}" + ) + + if speedup > 0: + speedups.append(speedup) + + if speedups: + avg_speedup = np.mean(speedups) + min_speedup = np.min(speedups) + max_speedup = np.max(speedups) + print(f"\n📊 {operation_name.title()} Speedup Summary:") + print(f" Average: {avg_speedup:.2f}x") + print(f" Range: {min_speedup:.2f}x - {max_speedup:.2f}x") + + # Print summaries for each operation + if forward_results: + print_operation_summary(forward_results, "forward") + + if backward_results: + print_operation_summary(backward_results, "backward") + + # Overall summary + print(f"\n" + "=" * 90) + print("OVERALL PERFORMANCE ANALYSIS") + print("=" * 90) + + all_speedups = [r["speedup"] for r in results if r["speedup"] > 0] + if all_speedups: + overall_avg = np.mean(all_speedups) + print(f"📈 Average speedup across all operations: {overall_avg:.2f}x") + + if overall_avg > 2.0: + print( + "🚀 Excellent performance! CUTLASS provides significant acceleration." + ) + elif overall_avg > 1.5: + print("✅ Good performance! CUTLASS provides solid acceleration.") + elif overall_avg > 1.0: + print("⚡ Moderate performance! CUTLASS provides some acceleration.") + else: + print("⚠️ Limited acceleration. Consider optimizing configuration.") + else: + print("❌ No speedup data available.") + + # Correctness summary + correct_results = [r for r in results if r["correctness"]] + total_results = len([r for r in results if r["cutlass_time"] < float("inf")]) + + if total_results > 0: + correctness_rate = len(correct_results) / total_results * 100 + print( + f"✅ Numerical correctness: {len(correct_results)}/{total_results} ({correctness_rate:.1f}%)" + ) + + +def main(): + """Main benchmark entry point""" + print("🧪 CUTLASS vs PyTorch Group GEMM Benchmark") + print("Focused on realistic MoE workloads with ~2048 feature dimensions") + print("=" * 70) + + if not torch.cuda.is_available(): + print("❌ CUDA not available") + return + + print(f"GPU: {torch.cuda.get_device_name()}") + print(f"CUDA Version: {torch.version.cuda}") + print(f"PyTorch Version: {torch.__version__}") + + # Run comprehensive benchmark + benchmark = GroupGemmBenchmark(device="cuda", dtype=torch.bfloat16) + results = benchmark.run_comprehensive_benchmark() + + print(f"\n🎉 Benchmark completed! 
Tested {len(results)} configurations.") + + +if __name__ == "__main__": + main() diff --git a/torchtitan/experiments/deepseek_v3/blackwell_group_gemm.py b/torchtitan/experiments/deepseek_v3/blackwell_group_gemm.py new file mode 100644 index 000000000..cde69ac65 --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/blackwell_group_gemm.py @@ -0,0 +1,736 @@ +""" +Complete Blackwell CUTLASS Group GEMM (Cute DSL) with autograd support. + +""" + +from typing import Tuple + +import torch +import torch.nn as nn + +try: + import cuda.bindings.driver as cuda + import cutlass + import cutlass.cute as cute + import cutlass.utils as utils + from cutlass.cute.runtime import from_dlpack + from torchtitan.experiments.kernels.blackwell.cute_grouped_gemm import ( + GroupedGemmKernel, + ) + + HAS_CUTLASS = True +except ImportError as e: + HAS_CUTLASS = False + print(f"❌ CUTLASS import failed: {e}") + + +class CUTLASSGroupGemmStrategy: + """ + Production CUTLASS strategy for grouped GEMM operations. + Handles both forward and backward passes with proper dimension management. + """ + + def __init__( + self, + use_2cta_instrs: bool = False, + mma_tiler_mn: Tuple[int, int] = (128, 128), + cluster_shape_mn: Tuple[int, int] = (1, 1), + ): + """ + Initialize CUTLASS strategy. + + Args: + use_2cta_instrs: Whether to use 2-CTA instructions + mma_tiler_mn: MMA tile sizes (M, N) + cluster_shape_mn: Cluster shape (M, N) + """ + if not HAS_CUTLASS: + raise RuntimeError("CUTLASS not available") + + print(f"🔧 Initializing CUTLASS GroupGemm strategy...") + + # Force CUDA context creation + dummy = torch.zeros(1, device="cuda") + dummy.cpu() + + self.use_2cta_instrs = use_2cta_instrs + self.mma_tiler_mn = mma_tiler_mn + self.cluster_shape_mn = cluster_shape_mn + + # CUTLASS configuration + self.DTYPE_TORCH = torch.bfloat16 + self.DTYPE_CUTLASS = cutlass.BFloat16 + self.ACC_DTYPE = cutlass.Float32 + self.ALIGNMENT = 16 + + # Initialize kernel + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.ACC_DTYPE, + use_2cta_instrs=use_2cta_instrs, + mma_tiler_mn=mma_tiler_mn, + cluster_shape_mn=cluster_shape_mn, + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + # Initialize hardware info + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters( + cluster_shape_mn[0] * cluster_shape_mn[1] + ) + + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) + + # Caches + self._compiled_kernels = {} + self._tensormap_buffers = {} + + print(f"✅ Strategy initialized:") + print(f" - 2CTA: {use_2cta_instrs}") + print(f" - MMA tiler: {mma_tiler_mn}") + print(f" - Cluster shape: {cluster_shape_mn}") + print(f" - Max active clusters: {self.max_active_clusters}") + + def _get_tensormap_buffer(self, device): + """Get or create tensormap buffer""" + if device not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + tensormap_tensor = torch.zeros( + (sm_count, 3, 128 // 8), + dtype=torch.int64, + device=device, + ) + self._tensormap_buffers[device] = from_dlpack( + tensormap_tensor, assumed_align=self.ALIGNMENT + ) + return self._tensormap_buffers[device] + + def _compute_total_clusters(self, problem_sizes): + """Compute total clusters needed""" + cluster_tile_m = self.mma_tiler_mn[0] + cluster_tile_n = self.mma_tiler_mn[1] + + if self.use_2cta_instrs: + cluster_tile_m //= 2 + + cluster_tile_m *= self.cluster_shape_mn[0] + cluster_tile_n *= self.cluster_shape_mn[1] + + total = 0 + for M, N, K, L 
in problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + return total + + def _create_initial_tensors(self, problem_shape, device): + """Create initial tensors for kernel compilation""" + M, N, K, L = problem_shape + + tensors = [ + torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), + torch.randn(N, K, dtype=self.DTYPE_TORCH, device=device), + torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), + ] + + cute_tensors = [] + for tensor in tensors: + mnkl_tensor = tensor.unsqueeze(-1).contiguous() + cute_tensor = from_dlpack(mnkl_tensor, assumed_align=self.ALIGNMENT) + cute_tensor.element_type = self.DTYPE_CUTLASS + cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) + cute_tensors.append(cute_tensor) + + return cute_tensors + + def execute_grouped_gemm( + self, A_list, B_list, C_list, operation_name="grouped_gemm" + ): + """ + Execute grouped GEMM operations: C_i = A_i @ B_i^T for each i + + Args: + A_list: List of A matrices + B_list: List of B matrices + C_list: List of output C matrices + operation_name: Name for debugging + """ + if not A_list or len(A_list) != len(B_list) or len(A_list) != len(C_list): + return + + device = A_list[0].device + + # Prepare metadata for all operations + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + + for A, B, C in zip(A_list, B_list, C_list): + M, K = A.shape + N, K_B = B.shape + + assert K == K_B, f"Inner dimension mismatch: {K} != {K_B}" + assert C.shape == ( + M, + N, + ), f"Output shape mismatch: expected ({M}, {N}), got {C.shape}" + + # Ensure contiguous + A = A.contiguous() + B = B.contiguous() + C = C.contiguous() + + L = 1 + + # Convert to MNKL format + A_mnkl = A.unsqueeze(-1).contiguous() + B_mnkl = B.unsqueeze(-1).contiguous() + C_mnkl = C.unsqueeze(-1).contiguous() + + # Add to batch + problem_sizes.append([M, N, K, L]) + strides_abc.append( + [ + list(A_mnkl.stride()[:2]), + list(B_mnkl.stride()[:2]), + list(C_mnkl.stride()[:2]), + ] + ) + ptrs_abc.append([A.data_ptr(), B.data_ptr(), C.data_ptr()]) + + # Execute grouped kernel + self._execute_kernel( + problem_sizes, strides_abc, ptrs_abc, device, operation_name + ) + + def _execute_kernel( + self, problem_sizes, strides_abc, ptrs_abc, device, operation_name + ): + """Execute the CUTLASS grouped kernel""" + num_groups = len(problem_sizes) + + # Convert to tensors + problem_sizes_tensor = torch.tensor( + problem_sizes, dtype=torch.int32, device=device + ) + strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) + ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) + + # Convert to CUTE tensors + problem_sizes_cute = from_dlpack( + problem_sizes_tensor, assumed_align=self.ALIGNMENT + ) + strides_cute = from_dlpack(strides_tensor, assumed_align=self.ALIGNMENT) + ptrs_cute = from_dlpack(ptrs_tensor, assumed_align=self.ALIGNMENT) + + # Get buffers + tensormap_cute = self._get_tensormap_buffer(device) + total_clusters = self._compute_total_clusters(problem_sizes) + initial_tensors = self._create_initial_tensors(problem_sizes[0], device) + + # Compile kernel if needed + cache_key = (num_groups, total_clusters, tuple(problem_sizes[0][:3])) + + if cache_key not in self._compiled_kernels: + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + *initial_tensors, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + 
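                # Illustrative example of the metadata assembled by
                # execute_grouped_gemm above for one contiguous bf16 group with
                # A: [M, K] = [256, 512], B: [N, K] = [1024, 512], C: [M, N] = [256, 1024]:
                #   problem_sizes = [[256, 1024, 512, 1]]
                #   strides_abc   = [[[512, 1], [512, 1], [1024, 1]]]  # row-major MNKL strides
                #   ptrs_abc      = [[A.data_ptr(), B.data_ptr(), C.data_ptr()]]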
self.stream, + ) + + # Execute + compiled_kernel = self._compiled_kernels[cache_key] + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + torch.cuda.synchronize() + + +class CUTLASSGroupGemm(torch.autograd.Function): + """ + PyTorch autograd Function for CUTLASS grouped GEMM. + + Forward: Y_i = X_i @ W_i^T for each expert i + Backward: + - dX_i = dY_i @ W_i for each expert i + - dW_i = dY_i^T @ X_i for each expert i + """ + + @staticmethod + def forward(ctx, input_tokens, weight_stack, m_sizes, m_offsets, strategy): + """ + Forward pass: Y_i = X_i @ W_i^T + + Args: + ctx: Autograd context + input_tokens: Sorted input tokens [total_tokens, in_features] + weight_stack: Expert weights [num_experts, out_features, in_features] + m_sizes: Tokens per expert [num_experts] + m_offsets: Token offsets [num_experts + 1] + strategy: CUTLASSGroupGemmStrategy instance + """ + ctx.save_for_backward(input_tokens, weight_stack, m_sizes, m_offsets) + ctx.strategy = strategy + + device = input_tokens.device + total_tokens, in_features = input_tokens.shape + num_experts, out_features, _ = weight_stack.shape + + # Initialize output + output = torch.zeros( + total_tokens, out_features, dtype=strategy.DTYPE_TORCH, device=device + ) + + # Check for valid experts + valid_mask = m_sizes > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return output + + # Execute forward grouped GEMM + CUTLASSGroupGemm._execute_forward_grouped( + input_tokens, + weight_stack, + m_sizes, + m_offsets, + valid_indices, + output, + strategy, + ) + + return output + + @staticmethod + def backward(ctx, grad_output): + """ + Backward pass: compute dX and dW + + Args: + ctx: Autograd context with saved tensors + grad_output: Upstream gradient [total_tokens, out_features] + """ + input_tokens, weight_stack, m_sizes, m_offsets = ctx.saved_tensors + strategy = ctx.strategy + + grad_output = grad_output.contiguous() + device = grad_output.device + + # Initialize gradients + grad_input = torch.zeros_like(input_tokens) + grad_weight = torch.zeros_like(weight_stack) + + # Check for valid experts + valid_mask = m_sizes > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return grad_input, grad_weight, None, None, None + + # Execute backward grouped operations + CUTLASSGroupGemm._execute_backward_grouped( + grad_output, + input_tokens, + weight_stack, + m_sizes, + m_offsets, + valid_indices, + grad_input, + grad_weight, + strategy, + ) + + return grad_input, grad_weight, None, None, None + + @staticmethod + def _execute_forward_grouped( + input_tokens, weight_stack, m_sizes, m_offsets, valid_indices, output, strategy + ): + """Execute grouped forward pass""" + # Prepare expert operations + A_list = [] # Input matrices + B_list = [] # Weight matrices (will be transposed by CUTLASS) + C_list = [] # Output matrices + + # Convert to CPU for iteration (minimal sync) + valid_sizes = m_sizes[valid_indices].cpu().tolist() + valid_offsets = ( + ( + m_offsets[valid_indices] + if len(m_offsets) > len(valid_indices) + else torch.cumsum( + torch.cat( + [ + torch.tensor([0], device=input_tokens.device), + m_sizes[valid_indices][:-1], + ] + ), + dim=0, + ) + ) + .cpu() + .tolist() + ) + valid_indices_cpu = valid_indices.cpu().tolist() + + for expert_idx, size, offset in zip( + valid_indices_cpu, valid_sizes, valid_offsets + ): + if size > 0: + # Get expert data + expert_input = input_tokens[ + 
offset : offset + size + ].contiguous() # [M, K] + expert_weight = weight_stack[expert_idx].contiguous() # [N, K] + expert_output = output[offset : offset + size] # [M, N] + + A_list.append(expert_input) + B_list.append(expert_weight) + C_list.append(expert_output) + + # Execute grouped GEMM: expert_input @ expert_weight^T + strategy.execute_grouped_gemm(A_list, B_list, C_list, "forward") + + @staticmethod + def _execute_backward_grouped( + grad_output, + input_tokens, + weight_stack, + m_sizes, + m_offsets, + valid_indices, + grad_input, + grad_weight, + strategy, + ): + """Execute grouped backward pass""" + # Convert to CPU for iteration (minimal sync) + valid_sizes = m_sizes[valid_indices].cpu().tolist() + valid_offsets = ( + ( + m_offsets[valid_indices] + if len(m_offsets) > len(valid_indices) + else torch.cumsum( + torch.cat( + [ + torch.tensor([0], device=grad_output.device), + m_sizes[valid_indices][:-1], + ] + ), + dim=0, + ) + ) + .cpu() + .tolist() + ) + valid_indices_cpu = valid_indices.cpu().tolist() + + # Prepare input gradient operations: dX_i = dY_i @ W_i + input_A_list = [] # grad_output matrices + input_B_list = [] # weight matrices (transposed for CUTLASS) + input_C_list = [] # grad_input matrices + + # Prepare weight gradient operations: dW_i = dY_i^T @ X_i + weight_A_list = [] # grad_output^T matrices + weight_B_list = [] # input matrices (transposed for CUTLASS) + weight_C_list = [] # grad_weight matrices + + for expert_idx, size, offset in zip( + valid_indices_cpu, valid_sizes, valid_offsets + ): + if size > 0: + # Get expert data + expert_grad_output = grad_output[ + offset : offset + size + ].contiguous() # [M, N] + expert_input = input_tokens[ + offset : offset + size + ].contiguous() # [M, K] + expert_weight = weight_stack[expert_idx].contiguous() # [N, K] + expert_grad_input = grad_input[offset : offset + size] # [M, K] + expert_grad_weight = grad_weight[expert_idx] # [N, K] + + # Input gradient: dX = dY @ W + # CUTLASS: dY @ (W^T)^T where W^T is passed as B + weight_for_input = expert_weight.t().contiguous() # [K, N] + input_A_list.append(expert_grad_output) + input_B_list.append(weight_for_input) + input_C_list.append(expert_grad_input) + + # Weight gradient: dW = dY^T @ X + # CUTLASS: dY^T @ (X^T)^T where X^T is passed as B + grad_output_T = expert_grad_output.t().contiguous() # [N, M] + input_for_weight = expert_input.t().contiguous() # [K, M] + weight_A_list.append(grad_output_T) + weight_B_list.append(input_for_weight) + weight_C_list.append(expert_grad_weight) + + # Execute grouped operations + if input_A_list: + strategy.execute_grouped_gemm( + input_A_list, input_B_list, input_C_list, "input_gradient" + ) + + if weight_A_list: + strategy.execute_grouped_gemm( + weight_A_list, weight_B_list, weight_C_list, "weight_gradient" + ) + + +class CUTLASSGroupedLinear(nn.Module): + """ + CUTLASS-accelerated grouped linear layer for expert-based models. + + Performs grouped linear transformations Y_i = X_i @ W_i^T for each expert i, + with automatic differentiation support for both forward and backward passes. + + Usage: + layer = CUTLASSGroupedLinear( + num_experts=8, + in_features=4096, + out_features=11008, + strategy=CUTLASSGroupGemmStrategy() + ) + + output = layer(input_tokens, expert_assignments) + """ + + def __init__( + self, + num_experts: int, + in_features: int, + out_features: int, + strategy: CUTLASSGroupGemmStrategy, + bias: bool = False, + dtype: torch.dtype = torch.bfloat16, + ): + """ + Initialize CUTLASS grouped linear layer. 
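        Note on numerics: with bfloat16 parameters and activations, outputs and
        gradients are only expected to match a plain per-expert PyTorch loop to
        loose tolerances (the accompanying benchmark treats a relative max
        difference below ~1e-3 for the forward pass and ~1e-2 for the gradients
        as agreement).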
+ + Args: + num_experts: Number of experts + in_features: Input feature dimension + out_features: Output feature dimension + strategy: CUTLASS strategy instance + bias: Whether to include bias (not supported yet) + dtype: Parameter data type + """ + super().__init__() + + if bias: + raise NotImplementedError("Bias not yet supported") + + self.num_experts = num_experts + self.in_features = in_features + self.out_features = out_features + self.strategy = strategy + self.dtype = dtype + + # Initialize expert weights [num_experts, out_features, in_features] + self.weight = nn.Parameter( + torch.empty(num_experts, out_features, in_features, dtype=dtype) + ) + + self.reset_parameters() + + def reset_parameters(self): + """Initialize parameters using Kaiming uniform initialization""" + for expert_idx in range(self.num_experts): + nn.init.kaiming_uniform_(self.weight[expert_idx], a=1.41421356) + + def forward( + self, input_tokens: torch.Tensor, expert_assignments: torch.Tensor + ) -> torch.Tensor: + """ + Forward pass through grouped linear layer. + + Args: + input_tokens: Input tokens [total_tokens, in_features] + expert_assignments: Expert assignment per token [total_tokens] + + Returns: + output: Transformed tokens [total_tokens, out_features] + """ + # Compute expert sizes and offsets + m_sizes, m_offsets = self._compute_expert_sizes_and_offsets(expert_assignments) + + # Sort tokens by expert assignment for contiguous memory access + sorted_indices = torch.argsort(expert_assignments) + sorted_tokens = input_tokens[sorted_indices] + + # Apply grouped GEMM + sorted_output = CUTLASSGroupGemm.apply( + sorted_tokens, self.weight, m_sizes, m_offsets, self.strategy + ) + + # Restore original token order + output = torch.empty_like(sorted_output) + output[sorted_indices] = sorted_output + + return output + + def _compute_expert_sizes_and_offsets( + self, expert_assignments: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Compute number of tokens per expert and their offsets. 
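        A vectorized equivalent of the counting loop below (a sketch; it should
        produce the same m_sizes)::

            m_sizes = torch.bincount(
                expert_assignments, minlength=self.num_experts
            ).to(torch.int32)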
+ + Args: + expert_assignments: Expert assignment per token [total_tokens] + + Returns: + m_sizes: Tokens per expert [num_experts] + m_offsets: Cumulative token offsets [num_experts + 1] + """ + device = expert_assignments.device + + # Count tokens per expert + m_sizes = torch.zeros(self.num_experts, dtype=torch.int32, device=device) + for expert_idx in range(self.num_experts): + m_sizes[expert_idx] = (expert_assignments == expert_idx).sum() + + # Compute cumulative offsets + m_offsets = torch.cat( + [torch.tensor([0], device=device), torch.cumsum(m_sizes, dim=0)] + ) + + return m_sizes, m_offsets + + def extra_repr(self) -> str: + """Return string representation of module parameters""" + return f"num_experts={self.num_experts}, in_features={self.in_features}, out_features={self.out_features}" + + +def test_cutlass_group_gemm(): + """Test the complete CUTLASS group GEMM implementation""" + print("🧪 Testing Complete CUTLASS Group GEMM") + print("=" * 50) + + if not HAS_CUTLASS: + print("❌ CUTLASS not available") + return False + + device = torch.device("cuda") + dtype = torch.bfloat16 + + # Test configuration + num_experts = 4 + in_features = 512 + out_features = 1024 + total_tokens = 256 + + print( + f"Configuration: {num_experts} experts, {total_tokens} tokens, {in_features}→{out_features}" + ) + + # Create strategy and layer + strategy = CUTLASSGroupGemmStrategy( + use_2cta_instrs=False, + mma_tiler_mn=(128, 128), + cluster_shape_mn=(1, 1), + ) + + layer = CUTLASSGroupedLinear( + num_experts=num_experts, + in_features=in_features, + out_features=out_features, + strategy=strategy, + dtype=dtype, + ).to(device) + + # Create test data + input_tokens = torch.randn( + total_tokens, in_features, dtype=dtype, device=device, requires_grad=True + ) + expert_assignments = torch.randint(0, num_experts, (total_tokens,), device=device) + + print(f"Input: {input_tokens.shape}") + print(f"Expert assignments: {expert_assignments.shape}") + print( + f"Expert distribution: {[torch.sum(expert_assignments == i).item() for i in range(num_experts)]}" + ) + + try: + # Forward pass + print("\n🔍 Forward Pass") + output = layer(input_tokens, expert_assignments) + print(f"Output: {output.shape}, norm: {output.norm().item():.4f}") + + # Backward pass + print("\n🔍 Backward Pass") + loss = output.sum() + loss.backward() + + # Check gradients + input_grad_norm = ( + input_tokens.grad.norm().item() if input_tokens.grad is not None else 0 + ) + weight_grad_norm = ( + layer.weight.grad.norm().item() if layer.weight.grad is not None else 0 + ) + + print(f"Input gradient norm: {input_grad_norm:.4f}") + print(f"Weight gradient norm: {weight_grad_norm:.4f}") + + # Validate gradients exist and are reasonable + success = ( + input_tokens.grad is not None + and layer.weight.grad is not None + and input_grad_norm > 0 + and weight_grad_norm > 0 + and torch.isfinite(input_tokens.grad).all() + and torch.isfinite(layer.weight.grad).all() + ) + + if success: + print("✅ CUTLASS Group GEMM test passed!") + else: + print("❌ CUTLASS Group GEMM test failed!") + + return success + + except Exception as e: + print(f"❌ Test failed with error: {e}") + import traceback + + traceback.print_exc() + return False + + +def create_cutlass_strategy( + use_2cta_instrs: bool = False, + mma_tiler_mn: Tuple[int, int] = (128, 128), + cluster_shape_mn: Tuple[int, int] = (1, 1), +) -> CUTLASSGroupGemmStrategy: + """ + Convenience function to create a CUTLASS strategy. 
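    For example::

        strategy = create_cutlass_strategy(mma_tiler_mn=(128, 128))
        layer = CUTLASSGroupedLinear(8, 4096, 11008, strategy)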
+ + Args: + use_2cta_instrs: Whether to use 2-CTA instructions + mma_tiler_mn: MMA tile sizes + cluster_shape_mn: Cluster shape + + Returns: + Configured CUTLASS strategy + """ + return CUTLASSGroupGemmStrategy( + use_2cta_instrs=use_2cta_instrs, + mma_tiler_mn=mma_tiler_mn, + cluster_shape_mn=cluster_shape_mn, + ) + + +if __name__ == "__main__": + test_cutlass_group_gemm() diff --git a/torchtitan/experiments/deepseek_v3/cute_group_gemm.py b/torchtitan/experiments/deepseek_v3/cute_group_gemm.py new file mode 100644 index 000000000..cafe151d1 --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/cute_group_gemm.py @@ -0,0 +1,733 @@ +""" +Stride-optimized CUTLASS Group GEMM implementation. +Uses stride manipulation instead of tensor transpositions for better performance. +""" + +from typing import List, Tuple + +import torch +import torch.nn as nn + +try: + import cuda.bindings.driver as cuda + import cutlass + import cutlass.cute as cute + import cutlass.utils as utils + from blackwell_group_gemm import GroupedGemmKernel + from cutlass.cute.runtime import from_dlpack + + HAS_CUTLASS = True +except ImportError as e: + HAS_CUTLASS = False + print(f"❌ CUTLASS import failed: {e}") + + +class StrideOptimizedCUTLASSStrategy: + """ + Stride-optimized CUTLASS strategy that avoids tensor transpositions. + Uses stride manipulation to achieve transpose effects without data movement. + """ + + def __init__( + self, + use_2cta_instrs: bool = False, + mma_tiler_mn: Tuple[int, int] = (128, 128), + cluster_shape_mn: Tuple[int, int] = (1, 1), + ): + """Initialize stride-optimized CUTLASS strategy""" + if not HAS_CUTLASS: + raise RuntimeError("CUTLASS not available") + + print(f"🚀 Initializing Stride-Optimized CUTLASS strategy...") + + # Force CUDA context creation + dummy = torch.zeros(1, device="cuda") + dummy.cpu() + + self.use_2cta_instrs = use_2cta_instrs + self.mma_tiler_mn = mma_tiler_mn + self.cluster_shape_mn = cluster_shape_mn + + # CUTLASS configuration + self.DTYPE_TORCH = torch.bfloat16 + self.DTYPE_CUTLASS = cutlass.BFloat16 + self.ACC_DTYPE = cutlass.Float32 + self.ALIGNMENT = 16 + + # Initialize kernel + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.ACC_DTYPE, + use_2cta_instrs=use_2cta_instrs, + mma_tiler_mn=mma_tiler_mn, + cluster_shape_mn=cluster_shape_mn, + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + # Initialize hardware info + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters( + cluster_shape_mn[0] * cluster_shape_mn[1] + ) + + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) + + # Caches + self._compiled_kernels = {} + self._tensormap_buffers = {} + + print(f"✅ Stride-optimized strategy initialized:") + print(f" - Zero-copy transpose operations") + print(f" - Stride-based layout manipulation") + print(f" - Max active clusters: {self.max_active_clusters}") + + def _get_tensormap_buffer(self, device): + """Get or create tensormap buffer""" + if device not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + tensormap_tensor = torch.zeros( + (sm_count, 3, 128 // 8), + dtype=torch.int64, + device=device, + ) + self._tensormap_buffers[device] = from_dlpack( + tensormap_tensor, assumed_align=self.ALIGNMENT + ) + return self._tensormap_buffers[device] + + def _compute_total_clusters(self, problem_sizes): + """Compute total clusters needed""" + cluster_tile_m = self.mma_tiler_mn[0] + cluster_tile_n = 
self.mma_tiler_mn[1] + + if self.use_2cta_instrs: + cluster_tile_m //= 2 + + cluster_tile_m *= self.cluster_shape_mn[0] + cluster_tile_n *= self.cluster_shape_mn[1] + + total = 0 + for M, N, K, L in problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + return total + + def _create_initial_tensors(self, problem_shape, device): + """Create initial tensors for kernel compilation""" + M, N, K, L = problem_shape + + tensors = [ + torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), + torch.randn(N, K, dtype=self.DTYPE_TORCH, device=device), + torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), + ] + + cute_tensors = [] + for tensor in tensors: + mnkl_tensor = tensor.unsqueeze(-1).contiguous() + cute_tensor = from_dlpack(mnkl_tensor, assumed_align=self.ALIGNMENT) + cute_tensor.element_type = self.DTYPE_CUTLASS + cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) + cute_tensors.append(cute_tensor) + + return cute_tensors + + def _get_transpose_strides(self, tensor: torch.Tensor) -> List[int]: + """ + Get strides for transposed tensor without actually transposing. + + Args: + tensor: Original tensor [M, N] + + Returns: + Transposed strides that make CUTLASS interpret data as [N, M] + """ + original_strides = tensor.stride() + # For transpose: [M, N] -> [N, M], swap the strides + return [original_strides[1], original_strides[0]] + + def execute_stride_grouped_gemm( + self, operations: List[dict], operation_name="stride_gemm" + ): + """ + Execute grouped GEMM with stride-based layout control. + + Args: + operations: List of operation dictionaries with keys: + - 'A': tensor A + - 'B': tensor B + - 'C': output tensor C + - 'transpose_A': bool, whether to logically transpose A + - 'transpose_B': bool, whether to logically transpose B + operation_name: Name for debugging + """ + if not operations: + return + + device = operations[0]["A"].device + + # Prepare metadata for all operations using stride manipulation + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + + for op in operations: + A = op["A"].contiguous() # Ensure contiguous for safe stride manipulation + B = op["B"].contiguous() + C = op["C"].contiguous() + transpose_A = op.get("transpose_A", False) + transpose_B = op.get("transpose_B", False) + + # Get logical shapes after transpose + if transpose_A: + M, K = A.shape[1], A.shape[0] # Logical shape after transpose + A_strides = self._get_transpose_strides(A) + else: + M, K = A.shape + A_strides = list(A.stride()) + + if transpose_B: + N, K_B = B.shape[0], B.shape[1] # B^T shape + # For CUTLASS B^T operation, we need to handle this specially + # CUTLASS will transpose B, so if we want B^T, we pass original B + B_strides = list(B.stride()) + else: + K_B, N = B.shape + # For CUTLASS B^T operation with no logical transpose + # We need to pass B with swapped strides to get B^T effect + B_strides = self._get_transpose_strides(B) + + # Validate dimensions + assert K == K_B, f"Inner dimension mismatch: {K} != {K_B}" + assert C.shape == ( + M, + N, + ), f"Output shape mismatch: expected ({M}, {N}), got {C.shape}" + + L = 1 + C_strides = list(C.stride()) + + # Add to batch + problem_sizes.append([M, N, K, L]) + strides_abc.append([A_strides, B_strides, C_strides]) + ptrs_abc.append([A.data_ptr(), B.data_ptr(), C.data_ptr()]) + + # Execute grouped kernel + self._execute_kernel( + problem_sizes, strides_abc, ptrs_abc, device, operation_name + ) + + def _execute_kernel( 
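        # The stride swap performed by _get_transpose_strides above mirrors what
        # torch.Tensor.t() does for a 2-D tensor: same storage, transposed layout.
        # For a contiguous b = torch.randn(64, 128):
        #   b.stride()       == (128, 1)
        #   b.t().stride()   == (1, 128)        # strides swapped, no data movement
        #   b.t().data_ptr() == b.data_ptr()    # zero-copy view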
+ self, problem_sizes, strides_abc, ptrs_abc, device, operation_name + ): + """Execute the CUTLASS grouped kernel""" + num_groups = len(problem_sizes) + + # Convert to tensors + problem_sizes_tensor = torch.tensor( + problem_sizes, dtype=torch.int32, device=device + ) + strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) + ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) + + # Convert to CUTE tensors + problem_sizes_cute = from_dlpack( + problem_sizes_tensor, assumed_align=self.ALIGNMENT + ) + strides_cute = from_dlpack(strides_tensor, assumed_align=self.ALIGNMENT) + ptrs_cute = from_dlpack(ptrs_tensor, assumed_align=self.ALIGNMENT) + + # Get buffers + tensormap_cute = self._get_tensormap_buffer(device) + total_clusters = self._compute_total_clusters(problem_sizes) + initial_tensors = self._create_initial_tensors(problem_sizes[0], device) + + # Compile kernel if needed + cache_key = (num_groups, total_clusters, tuple(problem_sizes[0][:3])) + + if cache_key not in self._compiled_kernels: + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + *initial_tensors, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + self.stream, + ) + + # Execute + compiled_kernel = self._compiled_kernels[cache_key] + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + torch.cuda.synchronize() + + +class StrideOptimizedGroupGemm(torch.autograd.Function): + """ + Stride-optimized CUTLASS grouped GEMM autograd function. + Avoids tensor transpositions by using stride manipulation. + """ + + @staticmethod + def forward(ctx, input_tokens, weight_stack, m_sizes, m_offsets, strategy): + """Forward pass using stride-optimized operations""" + ctx.save_for_backward(input_tokens, weight_stack, m_sizes, m_offsets) + ctx.strategy = strategy + + device = input_tokens.device + total_tokens, in_features = input_tokens.shape + num_experts, out_features, _ = weight_stack.shape + + # Initialize output + output = torch.zeros( + total_tokens, out_features, dtype=strategy.DTYPE_TORCH, device=device + ) + + # Check for valid experts + valid_mask = m_sizes > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return output + + # Execute forward grouped GEMM using stride optimization + StrideOptimizedGroupGemm._execute_forward_stride_optimized( + input_tokens, + weight_stack, + m_sizes, + m_offsets, + valid_indices, + output, + strategy, + ) + + return output + + @staticmethod + def backward(ctx, grad_output): + """Backward pass using stride-optimized operations""" + input_tokens, weight_stack, m_sizes, m_offsets = ctx.saved_tensors + strategy = ctx.strategy + + grad_output = grad_output.contiguous() + device = grad_output.device + + # Initialize gradients + grad_input = torch.zeros_like(input_tokens) + grad_weight = torch.zeros_like(weight_stack) + + # Check for valid experts + valid_mask = m_sizes > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return grad_input, grad_weight, None, None, None + + # Execute backward grouped operations using stride optimization + StrideOptimizedGroupGemm._execute_backward_stride_optimized( + grad_output, + input_tokens, + weight_stack, + m_sizes, + m_offsets, + valid_indices, + grad_input, + grad_weight, + strategy, + ) + + return grad_input, grad_weight, None, None, None + + @staticmethod + 
def _execute_forward_stride_optimized( + input_tokens, weight_stack, m_sizes, m_offsets, valid_indices, output, strategy + ): + """Execute forward pass with stride optimization""" + # Prepare stride-optimized operations + operations = [] + + # Convert to CPU for iteration (minimal sync) + valid_sizes = m_sizes[valid_indices].cpu().tolist() + valid_offsets = ( + ( + m_offsets[valid_indices] + if len(m_offsets) > len(valid_indices) + else torch.cumsum( + torch.cat( + [ + torch.tensor([0], device=input_tokens.device), + m_sizes[valid_indices][:-1], + ] + ), + dim=0, + ) + ) + .cpu() + .tolist() + ) + valid_indices_cpu = valid_indices.cpu().tolist() + + for expert_idx, size, offset in zip( + valid_indices_cpu, valid_sizes, valid_offsets + ): + if size > 0: + # Get expert data (all contiguous, no transpose needed) + expert_input = input_tokens[ + offset : offset + size + ].contiguous() # [M, K] + expert_weight = weight_stack[expert_idx].contiguous() # [N, K] + expert_output = output[offset : offset + size] # [M, N] + + # Forward: expert_input @ expert_weight^T + # A = expert_input [M, K], B = expert_weight [N, K] + # CUTLASS computes A @ B^T = expert_input @ expert_weight^T ✅ + operations.append( + { + "A": expert_input, + "B": expert_weight, + "C": expert_output, + "transpose_A": False, # No transpose needed + "transpose_B": True, # CUTLASS will transpose B automatically + } + ) + + # Execute all operations in one grouped call + strategy.execute_stride_grouped_gemm(operations, "forward_stride_opt") + + @staticmethod + def _execute_backward_stride_optimized( + grad_output, + input_tokens, + weight_stack, + m_sizes, + m_offsets, + valid_indices, + grad_input, + grad_weight, + strategy, + ): + """Execute backward pass with stride optimization""" + # Convert to CPU for iteration (minimal sync) + valid_sizes = m_sizes[valid_indices].cpu().tolist() + valid_offsets = ( + ( + m_offsets[valid_indices] + if len(m_offsets) > len(valid_indices) + else torch.cumsum( + torch.cat( + [ + torch.tensor([0], device=grad_output.device), + m_sizes[valid_indices][:-1], + ] + ), + dim=0, + ) + ) + .cpu() + .tolist() + ) + valid_indices_cpu = valid_indices.cpu().tolist() + + # Prepare input gradient operations: dX = dY @ W + input_operations = [] + # Prepare weight gradient operations: dW = dY^T @ X + weight_operations = [] + + for expert_idx, size, offset in zip( + valid_indices_cpu, valid_sizes, valid_offsets + ): + if size > 0: + # Get expert data (all contiguous) + expert_grad_output = grad_output[ + offset : offset + size + ].contiguous() # [M, N] + expert_input = input_tokens[ + offset : offset + size + ].contiguous() # [M, K] + expert_weight = weight_stack[expert_idx].contiguous() # [N, K] + expert_grad_input = grad_input[offset : offset + size] # [M, K] + expert_grad_weight = grad_weight[expert_idx] # [N, K] + + # Input gradient: dX = dY @ W + # We need: grad_output[M,N] @ weight[N,K] = grad_input[M,K] + # CUTLASS: A @ B^T, so we need B^T = weight^T = [K,N] + # Use stride manipulation: tell CUTLASS to interpret weight as [K,N] + input_operations.append( + { + "A": expert_grad_output, # [M, N] + "B": expert_weight, # [N, K] - will be stride-interpreted as [K, N] + "C": expert_grad_input, # [M, K] + "transpose_A": False, + "transpose_B": False, # Use stride manipulation instead of transpose + } + ) + + # Weight gradient: dW = dY^T @ X + # We need: grad_output^T[N,M] @ input[M,K] = grad_weight[N,K] + # CUTLASS: A @ B^T, so A = grad_output^T[N,M], B^T = input^T[K,M] + # Use stride manipulation for both A 
transpose and B transpose + weight_operations.append( + { + "A": expert_grad_output, # [M, N] - will be stride-interpreted as [N, M] + "B": expert_input, # [M, K] - will be stride-interpreted as [K, M] + "C": expert_grad_weight, # [N, K] + "transpose_A": True, # Use stride manipulation for transpose + "transpose_B": False, # CUTLASS handles B^T + } + ) + + # Execute grouped operations + if input_operations: + strategy.execute_stride_grouped_gemm( + input_operations, "input_grad_stride_opt" + ) + + if weight_operations: + strategy.execute_stride_grouped_gemm( + weight_operations, "weight_grad_stride_opt" + ) + + +class StrideOptimizedGroupedLinear(nn.Module): + """ + Stride-optimized CUTLASS grouped linear layer. + Provides significant performance improvements by avoiding tensor transpositions. + """ + + def __init__( + self, + num_experts: int, + in_features: int, + out_features: int, + strategy: StrideOptimizedCUTLASSStrategy, + bias: bool = False, + dtype: torch.dtype = torch.bfloat16, + ): + """Initialize stride-optimized grouped linear layer""" + super().__init__() + + if bias: + raise NotImplementedError("Bias not yet supported") + + self.num_experts = num_experts + self.in_features = in_features + self.out_features = out_features + self.strategy = strategy + self.dtype = dtype + + # Initialize expert weights + self.weight = nn.Parameter( + torch.empty(num_experts, out_features, in_features, dtype=dtype) + ) + + self.reset_parameters() + + def reset_parameters(self): + """Initialize parameters using Kaiming uniform initialization""" + for expert_idx in range(self.num_experts): + nn.init.kaiming_uniform_(self.weight[expert_idx], a=1.41421356) + + def forward( + self, input_tokens: torch.Tensor, expert_assignments: torch.Tensor + ) -> torch.Tensor: + """ + Stride-optimized forward pass. + + Args: + input_tokens: Input tokens [total_tokens, in_features] + expert_assignments: Expert assignment per token [total_tokens] + + Returns: + output: Transformed tokens [total_tokens, out_features] + """ + # Compute expert sizes and offsets + m_sizes, m_offsets = self._compute_expert_sizes_and_offsets(expert_assignments) + + # Sort tokens by expert assignment for contiguous memory access + sorted_indices = torch.argsort(expert_assignments) + sorted_tokens = input_tokens[sorted_indices] + + # Apply stride-optimized grouped GEMM (no transpositions!) 
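        # Descriptive note on the step below: StrideOptimizedGroupGemm.apply runs a
        # single grouped GEMM call across all experts, using m_sizes/m_offsets to
        # locate each expert's contiguous slice of sorted_tokens. Because
        # sorted_indices is a permutation, the scatter that follows
        # (output[sorted_indices] = sorted_output) exactly inverts the gather above
        # and restores the original token order.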
+ sorted_output = StrideOptimizedGroupGemm.apply( + sorted_tokens, self.weight, m_sizes, m_offsets, self.strategy + ) + + # Restore original token order + output = torch.empty_like(sorted_output) + output[sorted_indices] = sorted_output + + return output + + def _compute_expert_sizes_and_offsets( + self, expert_assignments: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute number of tokens per expert and their offsets""" + device = expert_assignments.device + + # Count tokens per expert + m_sizes = torch.zeros(self.num_experts, dtype=torch.int32, device=device) + for expert_idx in range(self.num_experts): + m_sizes[expert_idx] = (expert_assignments == expert_idx).sum() + + # Compute cumulative offsets + m_offsets = torch.cat( + [torch.tensor([0], device=device), torch.cumsum(m_sizes, dim=0)] + ) + + return m_sizes, m_offsets + + def extra_repr(self) -> str: + """Return string representation""" + return f"num_experts={self.num_experts}, in_features={self.in_features}, out_features={self.out_features}, stride_optimized=True" + + +def create_stride_optimized_strategy( + use_2cta_instrs: bool = False, + mma_tiler_mn: Tuple[int, int] = (128, 128), + cluster_shape_mn: Tuple[int, int] = (1, 1), +) -> StrideOptimizedCUTLASSStrategy: + """Create a stride-optimized CUTLASS strategy""" + return StrideOptimizedCUTLASSStrategy( + use_2cta_instrs=use_2cta_instrs, + mma_tiler_mn=mma_tiler_mn, + cluster_shape_mn=cluster_shape_mn, + ) + + +def test_stride_optimization(): + """Test stride-optimized vs regular implementation""" + print("🧪 Testing Stride-Optimized CUTLASS Group GEMM") + print("=" * 60) + + if not HAS_CUTLASS: + print("❌ CUTLASS not available") + return False + + device = torch.device("cuda") + dtype = torch.bfloat16 + + # Test configuration + num_experts = 8 + in_features = 2048 + out_features = 4096 + total_tokens = 1024 + + print( + f"Configuration: {num_experts} experts, {total_tokens} tokens, {in_features}→{out_features}" + ) + + # Create stride-optimized strategy and layer + stride_strategy = create_stride_optimized_strategy( + use_2cta_instrs=False, + mma_tiler_mn=(128, 128), + cluster_shape_mn=(1, 1), + ) + + stride_layer = StrideOptimizedGroupedLinear( + num_experts=num_experts, + in_features=in_features, + out_features=out_features, + strategy=stride_strategy, + dtype=dtype, + ).to(device) + + # Create test data + input_tokens = torch.randn( + total_tokens, in_features, dtype=dtype, device=device, requires_grad=True + ) + expert_assignments = torch.randint(0, num_experts, (total_tokens,), device=device) + + print( + f"Expert distribution: {[torch.sum(expert_assignments == i).item() for i in range(num_experts)]}" + ) + + try: + # Forward pass + print("\n🔍 Forward Pass (Zero-Copy Transpose)") + output = stride_layer(input_tokens, expert_assignments) + print(f"Output: {output.shape}, norm: {output.norm().item():.4f}") + + # Backward pass + print("\n🔍 Backward Pass (Stride-Based Gradients)") + loss = output.sum() + loss.backward() + + # Check gradients + input_grad_norm = ( + input_tokens.grad.norm().item() if input_tokens.grad is not None else 0 + ) + weight_grad_norm = ( + stride_layer.weight.grad.norm().item() + if stride_layer.weight.grad is not None + else 0 + ) + + print(f"Input gradient norm: {input_grad_norm:.4f}") + print(f"Weight gradient norm: {weight_grad_norm:.4f}") + + # Validate gradients exist and are reasonable + success = ( + input_tokens.grad is not None + and stride_layer.weight.grad is not None + and input_grad_norm > 0 + and weight_grad_norm > 0 + 
and torch.isfinite(input_tokens.grad).all() + and torch.isfinite(stride_layer.weight.grad).all() + ) + + if success: + print("✅ Stride-optimized CUTLASS Group GEMM test passed!") + print("\n💡 Performance benefits:") + print(" - Zero tensor transpositions") + print(" - No memory copying for layout changes") + print(" - Reduced memory bandwidth usage") + print(" - Better cache efficiency") + else: + print("❌ Stride-optimized test failed!") + + return success + + except Exception as e: + print(f"❌ Test failed with error: {e}") + import traceback + + traceback.print_exc() + return False + + +def create_cutlass_strategy( + use_2cta_instrs: bool = False, + mma_tiler_mn: Tuple[int, int] = (128, 128), + cluster_shape_mn: Tuple[int, int] = (1, 1), +) -> StrideOptimizedCUTLASSStrategy: + """ + Convenience function to create a CUTLASS strategy. + + Args: + use_2cta_instrs: Whether to use 2-CTA instructions + mma_tiler_mn: MMA tile sizes + cluster_shape_mn: Cluster shape + + Returns: + Configured CUTLASS strategy + """ + return StrideOptimizedCUTLASSStrategy( + use_2cta_instrs=use_2cta_instrs, + mma_tiler_mn=mma_tiler_mn, + cluster_shape_mn=cluster_shape_mn, + ) + + +if __name__ == "__main__": + test_stride_optimization() diff --git a/torchtitan/experiments/deepseek_v3/simple_debug_back.py b/torchtitan/experiments/deepseek_v3/simple_debug_back.py deleted file mode 100644 index 10e95a5b4..000000000 --- a/torchtitan/experiments/deepseek_v3/simple_debug_back.py +++ /dev/null @@ -1,332 +0,0 @@ -#!/usr/bin/env python3 -""" -Simple debug script to isolate CUTLASS backward pass issues. -Run this to identify exactly where the numerical problems occur. -""" - -import numpy as np -import torch -import torch.nn as nn - - -def test_single_expert_operations(): - """Test operations on a single expert to isolate issues""" - print("🔍 Testing Single Expert Operations") - print("=" * 50) - - device = torch.device("cuda") - dtype = torch.bfloat16 - - # Simple test case - M, N, K = 32, 64, 128 # Small sizes for debugging - - # Create test data - X = torch.randn(M, K, dtype=dtype, device=device, requires_grad=True) # Input - W = torch.randn(N, K, dtype=dtype, device=device, requires_grad=True) # Weight - - print(f"Input X: {X.shape}") - print(f"Weight W: {W.shape}") - - # Forward pass: Y = X @ W^T - Y = torch.mm(X, W.t()) # [M, N] - print(f"Output Y: {Y.shape}") - - # Create upstream gradient - dY = torch.randn_like(Y) - print(f"Upstream grad dY: {dY.shape}") - - # Compute reference gradients - print("\n📊 Reference PyTorch Gradients:") - Y_ref = torch.mm(X, W.t()) - Y_ref.backward(dY, retain_graph=True) - - dX_ref = X.grad.clone() - dW_ref = W.grad.clone() - - print(f"dX_ref norm: {dX_ref.norm().item():.4f}") - print(f"dW_ref norm: {dW_ref.norm().item():.4f}") - - # Clear gradients - X.grad = None - W.grad = None - - # Manual gradient computation - print("\n🧮 Manual Gradient Computation:") - dX_manual = torch.mm(dY, W) # [M, N] @ [N, K] = [M, K] - dW_manual = torch.mm(dY.t(), X) # [N, M] @ [M, K] = [N, K] - - print(f"dX_manual norm: {dX_manual.norm().item():.4f}") - print(f"dW_manual norm: {dW_manual.norm().item():.4f}") - - # Check manual vs reference - dX_diff = torch.abs(dX_manual - dX_ref).max().item() - dW_diff = torch.abs(dW_manual - dW_ref).max().item() - - print(f"\n✅ Manual vs Reference:") - print(f"dX difference: {dX_diff:.2e}") - print(f"dW difference: {dW_diff:.2e}") - - if dX_diff < 1e-3 and dW_diff < 1e-3: - print("✅ Manual gradients match reference!") - else: - print("❌ Manual gradients don't match!") - 
return False - - return dX_manual, dW_manual, X, W, dY - - -def test_cutlass_simple_operations(): - """Test CUTLASS operations step by step""" - print("\n🔍 Testing CUTLASS Simple Operations") - print("=" * 50) - - try: - from cutlass_backwards_debug import ( - CUTLASSBackwardGroupGemmDebug, - CUTLASSGroupedGemmStrategyDebug, - ) - except ImportError: - print("❌ Cannot import debug modules") - return False - - # Get reference data from single expert test - dX_ref, dW_ref, X, W, dY = test_single_expert_operations() - - device = X.device - dtype = X.dtype - - # Create debug strategy - strategy = CUTLASSGroupedGemmStrategyDebug( - debug_mode=True, - backward_method="approach_3", # Single expert debugging - use_2cta_instrs=False, - mma_tiler_mn=(128, 128), - cluster_shape_mn=(1, 1), - ) - - print(f"\n🔧 Testing CUTLASS Single Expert Operations:") - - # Test single expert CUTLASS operations - try: - dX_cutlass, dW_cutlass = ( - CUTLASSBackwardGroupGemmDebug._test_single_expert_cutlass( - dY, X, W, strategy - ) - ) - - # Compare with reference - dX_cutlass_diff = torch.abs(dX_cutlass - dX_ref).max().item() - dW_cutlass_diff = torch.abs(dW_cutlass - dW_ref).max().item() - - print(f"\n📊 CUTLASS vs Reference:") - print(f"dX difference: {dX_cutlass_diff:.2e}") - print(f"dW difference: {dW_cutlass_diff:.2e}") - - if dX_cutlass_diff < 1e-2 and dW_cutlass_diff < 1e-2: - print("✅ CUTLASS single expert operations working!") - return True - else: - print("❌ CUTLASS single expert operations have issues") - return False - - except Exception as e: - print(f"❌ CUTLASS single expert test failed: {e}") - import traceback - - traceback.print_exc() - return False - - -def test_grouped_operations(): - """Test full grouped operations""" - print("\n🔍 Testing Grouped Operations") - print("=" * 50) - - try: - from cutlass_backwards_debug import ( - CUTLASSGroupedGemmStrategyDebug, - CUTLASSGroupedLinearDebug, - ) - except ImportError: - print("❌ Cannot import debug modules") - return False - - device = torch.device("cuda") - dtype = torch.bfloat16 - - # Test parameters - num_experts = 4 - in_features = 256 - out_features = 512 - total_tokens = 128 - - # Test different approaches - approaches = ["approach_1", "approach_2", "approach_3"] - - for approach in approaches: - print(f"\n🔧 Testing grouped operations with {approach}") - - try: - # Create strategy - strategy = CUTLASSGroupedGemmStrategyDebug( - debug_mode=True, - backward_method=approach, - use_2cta_instrs=False, - mma_tiler_mn=(128, 128), - cluster_shape_mn=(1, 1), - ) - - # Create test data - input_tokens = torch.randn( - total_tokens, - in_features, - dtype=dtype, - device=device, - requires_grad=True, - ) - expert_assignments = torch.randint( - 0, num_experts, (total_tokens,), device=device - ) - - # Create layer - layer = CUTLASSGroupedLinearDebug( - num_experts, in_features, out_features, strategy, dtype=dtype - ) - layer = layer.to(device) - - # Forward pass - output = layer(input_tokens, expert_assignments) - - # Backward pass - loss = output.sum() - loss.backward() - - print(f"✅ {approach} completed successfully") - - # Check if gradients exist and are reasonable - if input_tokens.grad is not None: - input_grad_norm = input_tokens.grad.norm().item() - print(f" Input grad norm: {input_grad_norm:.4f}") - else: - print(" ❌ No input gradient!") - - if layer.weight.grad is not None: - weight_grad_norm = layer.weight.grad.norm().item() - print(f" Weight grad norm: {weight_grad_norm:.4f}") - else: - print(" ❌ No weight gradient!") - - except Exception as e: - 
print(f"❌ {approach} failed: {e}") - import traceback - - traceback.print_exc() - - -def debug_matrix_operations(): - """Debug the core matrix operations used in backward pass""" - print("\n🔍 Debugging Core Matrix Operations") - print("=" * 50) - - device = torch.device("cuda") - dtype = torch.bfloat16 - - # Test data - M, N, K = 16, 32, 64 - - A = torch.randn(M, K, dtype=dtype, device=device) - B = torch.randn(N, K, dtype=dtype, device=device) - dC = torch.randn(M, N, dtype=dtype, device=device) - - print(f"A: {A.shape}, B: {B.shape}, dC: {dC.shape}") - - # Test different formulations of the same operations - print("\n📊 Testing Input Gradient Formulations:") - - # Method 1: Direct computation dA = dC @ B - dA_direct = torch.mm(dC, B) # [M, N] @ [N, K] = [M, K] - print(f"Method 1 (direct): {dA_direct.shape}, norm: {dA_direct.norm().item():.4f}") - - # Method 2: Transpose formulation dA^T = B^T @ dC^T - dA_transpose = torch.mm(B.t(), dC.t()).t() # [K, N] @ [N, M] = [K, M] -> [M, K] - print( - f"Method 2 (transpose): {dA_transpose.shape}, norm: {dA_transpose.norm().item():.4f}" - ) - - # Check if they're the same - dA_diff = torch.abs(dA_direct - dA_transpose).max().item() - print(f"Difference: {dA_diff:.2e}") - - print("\n📊 Testing Weight Gradient Formulations:") - - # Method 1: Direct computation dB = dC^T @ A - dB_direct = torch.mm(dC.t(), A) # [N, M] @ [M, K] = [N, K] - print(f"Method 1 (direct): {dB_direct.shape}, norm: {dB_direct.norm().item():.4f}") - - # Method 2: Using transpose dB^T = A^T @ dC, then transpose - dB_transpose = torch.mm(A.t(), dC).t() # [K, M] @ [M, N] = [K, N] -> [N, K] - print( - f"Method 2 (transpose): {dB_transpose.shape}, norm: {dB_transpose.norm().item():.4f}" - ) - - # Check if they're the same - dB_diff = torch.abs(dB_direct - dB_transpose).max().item() - print(f"Difference: {dB_diff:.2e}") - - if dA_diff < 1e-5 and dB_diff < 1e-5: - print("✅ All formulations are mathematically equivalent!") - return True - else: - print("❌ Formulations don't match - there's a mathematical error!") - return False - - -def main(): - """Main debug sequence""" - print("🧪 CUTLASS Backward Pass Debug Suite") - print("=" * 60) - - # Step 4: Test grouped operations - print("\n" + "=" * 60) - test_grouped_operations() - - if not torch.cuda.is_available(): - print("❌ CUDA not available") - return - - print(f"GPU: {torch.cuda.get_device_name()}") - print(f"CUDA Version: {torch.version.cuda}") - - # Step 1: Test mathematical formulations - print("\n" + "=" * 60) - if not debug_matrix_operations(): - print("❌ Mathematical formulations failed - fix before continuing") - return - - # Step 2: Test single expert operations - print("\n" + "=" * 60) - single_expert_result = test_single_expert_operations() - if single_expert_result is False: - print("❌ Single expert operations failed") - return - - # Step 3: Test CUTLASS single expert operations - print("\n" + "=" * 60) - if not test_cutlass_simple_operations(): - print("❌ CUTLASS single expert operations failed") - return - - # Step 4: Test grouped operations - print("\n" + "=" * 60) - test_grouped_operations() - - print("\n" + "=" * 60) - print("🎯 Debug sequence completed!") - print("\nNext steps based on results:") - print("1. If single expert works but grouped fails -> issue in batching/metadata") - print("2. If CUTLASS single expert fails -> issue in CUTLASS setup") - print("3. 
If mathematical formulations fail -> fundamental math error") - - -if __name__ == "__main__": - main() diff --git a/torchtitan/experiments/deepseek_v3/dsl_back_standalone.py b/torchtitan/experiments/deepseek_v3/simple_test_back_gg.py similarity index 100% rename from torchtitan/experiments/deepseek_v3/dsl_back_standalone.py rename to torchtitan/experiments/deepseek_v3/simple_test_back_gg.py From 4661156c49ac847df6daddc2e12b3344d6ca9bdc Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sat, 14 Jun 2025 13:56:13 -0700 Subject: [PATCH 19/34] add _set_cuda_context, update simple backwards test --- .../deepseek_v3/bench_gg_blackwell.py | 14 +-- .../deepseek_v3/cute_group_gemm.py | 36 ++++-- .../deepseek_v3/simple_test_back_gg.py | 107 +++++++++++++++++- 3 files changed, 141 insertions(+), 16 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/bench_gg_blackwell.py b/torchtitan/experiments/deepseek_v3/bench_gg_blackwell.py index 0100fa4ab..5122f1f3b 100644 --- a/torchtitan/experiments/deepseek_v3/bench_gg_blackwell.py +++ b/torchtitan/experiments/deepseek_v3/bench_gg_blackwell.py @@ -11,7 +11,7 @@ import torch import torch.nn as nn -# torch.cuda.use_t +torch.backends.cuda.matmul.allow_tf32 = True # Import triton for benchmarking try: @@ -187,9 +187,9 @@ def benchmark_forward_pass(self, config: dict, warmup: int = 5, rep: int = 20): cutlass_layer = None if HAS_CUTLASS: strategy = create_cutlass_strategy( - use_2cta_instrs=False, - mma_tiler_mn=(128, 128), - cluster_shape_mn=(1, 1), + use_2cta_instrs=True, + mma_tiler_mn=(256, 128), + cluster_shape_mn=(2, 2), ) cutlass_layer = CUTLASSGroupedLinear( config["num_experts"], @@ -288,9 +288,9 @@ def benchmark_backward_pass(self, config: dict, warmup: int = 5, rep: int = 20): cutlass_layer = None if HAS_CUTLASS: strategy = create_cutlass_strategy( - use_2cta_instrs=False, - mma_tiler_mn=(128, 128), - cluster_shape_mn=(1, 1), + use_2cta_instrs=True, + mma_tiler_mn=(256, 128), + cluster_shape_mn=(2, 2), ) cutlass_layer = CUTLASSGroupedLinear( config["num_experts"], diff --git a/torchtitan/experiments/deepseek_v3/cute_group_gemm.py b/torchtitan/experiments/deepseek_v3/cute_group_gemm.py index cafe151d1..65b6a959f 100644 --- a/torchtitan/experiments/deepseek_v3/cute_group_gemm.py +++ b/torchtitan/experiments/deepseek_v3/cute_group_gemm.py @@ -1,6 +1,24 @@ """ Stride-optimized CUTLASS Group GEMM implementation. Uses stride manipulation instead of tensor transpositions for better performance. 
+ +errors: + +BACKWARD PASS RESULTS: +------------------------------------------------------------------------------------------ +Config PyTorch (ms) CUTLASS (ms) Speedup Correct Max Diff +---------------------------------------------------------------------- +Small-4E 1.15 1.53 0.75x ❌ 6.4e+01 +Small-8E 1.69 1.63 1.04x ❌ 6.0e+01 +MoE-7B-Gate 5.84 3.98 1.47x ❌ 1.0e+02 +MoE-7B-Down 5.83 3.86 1.51x ❌ 9.9e+01 +Large-16E 19.65 5.96 3.30x ❌ 7.5e+01 +XLarge-32E 69.61 7.72 9.02x ❌ 7.2e+01 +DeepSeek-64E 813.77 29.82 27.29x ❌ 6.4e+01 + +📊 Backward Speedup Summary: + Average: 6.34x + Range: 0.75x - 27.29x """ from typing import List, Tuple @@ -38,11 +56,7 @@ def __init__( if not HAS_CUTLASS: raise RuntimeError("CUTLASS not available") - print(f"🚀 Initializing Stride-Optimized CUTLASS strategy...") - - # Force CUDA context creation - dummy = torch.zeros(1, device="cuda") - dummy.cpu() + print(f" Initializing Stride-Optimized CUTLASS strategy...") self.use_2cta_instrs = use_2cta_instrs self.mma_tiler_mn = mma_tiler_mn @@ -76,10 +90,16 @@ def __init__( self._compiled_kernels = {} self._tensormap_buffers = {} - print(f"✅ Stride-optimized strategy initialized:") - print(f" - Zero-copy transpose operations") - print(f" - Stride-based layout manipulation") print(f" - Max active clusters: {self.max_active_clusters}") + print( + f"kernel params: {self.ACC_DTYPE=}, {use_2cta_instrs=}, {mma_tiler_mn=}, {cluster_shape_mn=}" + ) + + def _set_cuda_context(self): + # Force CUDA context creation + dummy = torch.zeros(1, device="cuda") + dummy.cpu() + del dummy def _get_tensormap_buffer(self, device): """Get or create tensormap buffer""" diff --git a/torchtitan/experiments/deepseek_v3/simple_test_back_gg.py b/torchtitan/experiments/deepseek_v3/simple_test_back_gg.py index 639af9169..bb40d5f7e 100644 --- a/torchtitan/experiments/deepseek_v3/simple_test_back_gg.py +++ b/torchtitan/experiments/deepseek_v3/simple_test_back_gg.py @@ -1,7 +1,112 @@ #!/usr/bin/env python3 """ Fixed standalone CUTLASS backward pass test. -Corrects the dimensional issues in the matrix operations. + +current: +Initializing fixed CUTLASS strategy... +cute hardware - device_id 0 +cute hardware - driver_version 12080 +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 +✅ Strategy initialized (max_active_clusters: 148) +Forward was: Y = Xtorch.Size([32, 128]) @ W^Ttorch.Size([64, 128]) +Given upstream grad dYtorch.Size([32, 64]) +Computing dX and dW... +Reference dX: torch.Size([32, 128]), norm: 494.0000 +Reference dW: torch.Size([64, 128]), norm: 492.0000 + +CUTLASS computation: + Computing input gradient: dYtorch.Size([32, 64]) @ Wtorch.Size([64, 128]) = dX[32, 128] + CUTLASS setup: dYtorch.Size([32, 64]) @ (W^T)^Ttorch.Size([128, 64]) = dXtorch.Size([32, 128]) + Executing backward_input: Atorch.Size([32, 64]) @ B^Ttorch.Size([128, 64]) = Ctorch.Size([32, 128]) + Problem: [32, 128, 64, 1] + Strides: [[64, 1], [64, 1], [128, 1]] +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 + Compiling kernel for backward_input... + ✅ Kernel compiled + ✅ backward_input executed + Computing weight gradient: dY^Ttorch.Size([32, 64]) @ Xtorch.Size([32, 128]) = dW[64, 128] + CUTLASS setup: dY^Ttorch.Size([64, 32]) @ (X^T)^Ttorch.Size([128, 32]) = dWtorch.Size([64, 128]) + Executing backward_weight: Atorch.Size([64, 32]) @ B^Ttorch.Size([128, 32]) = Ctorch.Size([64, 128]) + Problem: [64, 128, 32, 1] + Strides: [[32, 1], [32, 1], [128, 1]] + Compiling kernel for backward_weight... 
+ ✅ Kernel compiled + ✅ backward_weight executed +CUTLASS dX: torch.Size([32, 128]), norm: 494.0000 +CUTLASS dW: torch.Size([64, 128]), norm: 492.0000 + +Comparison: +dX max diff: 0.00e+00 (relative: 0.00e+00) +dW max diff: 0.00e+00 (relative: 0.00e+00) +✅ Complete backward pass works! + +============================================================ + +🔍 Testing Grouped Backward (2 experts) +============================================= +🔧 Initializing fixed CUTLASS strategy... +cute hardware - device_id 0 +cute hardware - driver_version 12080 +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 +✅ Strategy initialized (max_active_clusters: 148) +Setup: 2 experts, 16 tokens each +X: torch.Size([32, 64]), W: torch.Size([2, 128, 64]), dY: torch.Size([32, 128]) +Reference dX norm: 516.0000 +Reference dW norm: 524.0000 + +Expert 0: + Computing input gradient: dYtorch.Size([16, 128]) @ Wtorch.Size([128, 64]) = dX[16, 64] + CUTLASS setup: dYtorch.Size([16, 128]) @ (W^T)^Ttorch.Size([64, 128]) = dXtorch.Size([16, 64]) + Executing expert_0_input: Atorch.Size([16, 128]) @ B^Ttorch.Size([64, 128]) = Ctorch.Size([16, 64]) + Problem: [16, 64, 128, 1] + Strides: [[128, 1], [128, 1], [64, 1]] +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 + Compiling kernel for expert_0_input... + ✅ Kernel compiled + ✅ expert_0_input executed + Computing weight gradient: dY^Ttorch.Size([16, 128]) @ Xtorch.Size([16, 64]) = dW[128, 64] + CUTLASS setup: dY^Ttorch.Size([128, 16]) @ (X^T)^Ttorch.Size([64, 16]) = dWtorch.Size([128, 64]) + Executing expert_0_weight: Atorch.Size([128, 16]) @ B^Ttorch.Size([64, 16]) = Ctorch.Size([128, 64]) + Problem: [128, 64, 16, 1] + Strides: [[16, 1], [16, 1], [64, 1]] + Compiling kernel for expert_0_weight... + ✅ Kernel compiled + ✅ expert_0_weight executed + +Expert 1: + Computing input gradient: dYtorch.Size([16, 128]) @ Wtorch.Size([128, 64]) = dX[16, 64] + CUTLASS setup: dYtorch.Size([16, 128]) @ (W^T)^Ttorch.Size([64, 128]) = dXtorch.Size([16, 64]) + Executing expert_1_input: Atorch.Size([16, 128]) @ B^Ttorch.Size([64, 128]) = Ctorch.Size([16, 64]) + Problem: [16, 64, 128, 1] + Strides: [[128, 1], [128, 1], [64, 1]] + ✅ expert_1_input executed + Computing weight gradient: dY^Ttorch.Size([16, 128]) @ Xtorch.Size([16, 64]) = dW[128, 64] + CUTLASS setup: dY^Ttorch.Size([128, 16]) @ (X^T)^Ttorch.Size([64, 16]) = dWtorch.Size([128, 64]) + Executing expert_1_weight: Atorch.Size([128, 16]) @ B^Ttorch.Size([64, 16]) = Ctorch.Size([128, 64]) + Problem: [128, 64, 16, 1] + Strides: [[16, 1], [16, 1], [64, 1]] + ✅ expert_1_weight executed + +CUTLASS dX norm: 516.0000 +CUTLASS dW norm: 524.0000 + +Comparison: +dX max diff: 0.00e+00 (relative: 0.00e+00) +dW max diff: 0.00e+00 (relative: 0.00e+00) +✅ Grouped backward pass works! 
+ +============================================================ +📊 FINAL RESULTS +============================================================ +Basic CUTLASS GEMM ✅ PASS +Input Gradient ✅ PASS +Weight Gradient ✅ PASS +Complete Backward ✅ PASS +Grouped Backward ✅ PASS """ import torch From a288251da0ebee7f8a1429f0c8e039a61c7519e4 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sun, 15 Jun 2025 00:01:46 -0700 Subject: [PATCH 20/34] backwards K mismatch --- .../deepseek_v3/cute_group_gemm.py | 279 ++++++++++++------ 1 file changed, 182 insertions(+), 97 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/cute_group_gemm.py b/torchtitan/experiments/deepseek_v3/cute_group_gemm.py index 65b6a959f..834edca28 100644 --- a/torchtitan/experiments/deepseek_v3/cute_group_gemm.py +++ b/torchtitan/experiments/deepseek_v3/cute_group_gemm.py @@ -2,23 +2,128 @@ Stride-optimized CUTLASS Group GEMM implementation. Uses stride manipulation instead of tensor transpositions for better performance. + errors: -BACKWARD PASS RESULTS: ------------------------------------------------------------------------------------------- -Config PyTorch (ms) CUTLASS (ms) Speedup Correct Max Diff ----------------------------------------------------------------------- -Small-4E 1.15 1.53 0.75x ❌ 6.4e+01 -Small-8E 1.69 1.63 1.04x ❌ 6.0e+01 -MoE-7B-Gate 5.84 3.98 1.47x ❌ 1.0e+02 -MoE-7B-Down 5.83 3.86 1.51x ❌ 9.9e+01 -Large-16E 19.65 5.96 3.30x ❌ 7.5e+01 -XLarge-32E 69.61 7.72 9.02x ❌ 7.2e+01 -DeepSeek-64E 813.77 29.82 27.29x ❌ 6.4e+01 - -📊 Backward Speedup Summary: - Average: 6.34x - Range: 0.75x - 27.29x + +🔍 Benchmarking Backward Pass: Small-8E + 8 experts, 1024 tokens, 2048→2048 + Initializing Stride-Optimized CUTLASS strategy... +cute hardware - device_id 0 +cute hardware - driver_version 12080 +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 + - Max active clusters: 33 +kernel params: self.ACC_DTYPE=, use_2cta_instrs=True, mma_tiler_mn=(256, 128), cluster_shape_mn=(2, 2) +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 +❌ Error benchmarking Small-8E: Dimension mismatch: K=135 vs K_B=2048. A: torch.Size([135, 2048]), B: torch.Size([135, 2048]), transpose_A=True, transpose_B=True + +====================================================================== +📊 Configuration: MoE-7B-Gate + Experts: 8, Tokens: 2048 + Dimensions: 4096 → 11008 + Problem size: ~92341.8M operations + +🔍 Benchmarking Forward Pass: MoE-7B-Gate + 8 experts, 2048 tokens, 4096→11008 + Initializing Stride-Optimized CUTLASS strategy... +cute hardware - device_id 0 +cute hardware - driver_version 12080 +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 + - Max active clusters: 33 +kernel params: self.ACC_DTYPE=, use_2cta_instrs=True, mma_tiler_mn=(256, 128), cluster_shape_mn=(2, 2) +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 + PyTorch: 0.73 ms + CUTLASS: 1.06 ms + Speedup: 0.69x + Correct: ✅ (max diff: 0.00e+00) + +🔍 Benchmarking Backward Pass: MoE-7B-Gate + 8 experts, 2048 tokens, 4096→11008 + Initializing Stride-Optimized CUTLASS strategy... +cute hardware - device_id 0 +cute hardware - driver_version 12080 +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 + - Max active clusters: 33 +kernel params: self.ACC_DTYPE=, use_2cta_instrs=True, mma_tiler_mn=(256, 128), cluster_shape_mn=(2, 2) +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 +❌ Error benchmarking MoE-7B-Gate: Dimension mismatch: K=11008 vs K_B=4096. 
A: torch.Size([261, 11008]), B: torch.Size([11008, 4096]), transpose_A=False, transpose_B=True + +====================================================================== +📊 Configuration: MoE-7B-Down + Experts: 8, Tokens: 2048 + Dimensions: 11008 → 4096 + Problem size: ~92341.8M operations + +🔍 Benchmarking Forward Pass: MoE-7B-Down + 8 experts, 2048 tokens, 11008→4096 + Initializing Stride-Optimized CUTLASS strategy... +cute hardware - device_id 0 +cute hardware - driver_version 12080 +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 + - Max active clusters: 33 +kernel params: self.ACC_DTYPE=, use_2cta_instrs=True, mma_tiler_mn=(256, 128), cluster_shape_mn=(2, 2) +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 + PyTorch: 0.69 ms + CUTLASS: 1.02 ms + Speedup: 0.68x + Correct: ✅ (max diff: 0.00e+00) + +🔍 Benchmarking Backward Pass: MoE-7B-Down + 8 experts, 2048 tokens, 11008→4096 + Initializing Stride-Optimized CUTLASS strategy... +cute hardware - device_id 0 +cute hardware - driver_version 12080 +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 + - Max active clusters: 33 +kernel params: self.ACC_DTYPE=, use_2cta_instrs=True, mma_tiler_mn=(256, 128), cluster_shape_mn=(2, 2) +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 +❌ Error benchmarking MoE-7B-Down: Dimension mismatch: K=4096 vs K_B=11008. A: torch.Size([260, 4096]), B: torch.Size([4096, 11008]), transpose_A=False, transpose_B=True + +====================================================================== +📊 Configuration: Large-16E + Experts: 16, Tokens: 4096 + Dimensions: 4096 → 11008 + Problem size: ~184683.6M operations + +🔍 Benchmarking Forward Pass: Large-16E + 16 experts, 4096 tokens, 4096→11008 + Initializing Stride-Optimized CUTLASS strategy... +cute hardware - device_id 0 +cute hardware - driver_version 12080 +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 + - Max active clusters: 33 +kernel params: self.ACC_DTYPE=, use_2cta_instrs=True, mma_tiler_mn=(256, 128), cluster_shape_mn=(2, 2) +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 + PyTorch: 1.27 ms + CUTLASS: 1.64 ms + Speedup: 0.77x + Correct: ✅ (max diff: 0.00e+00) + +🔍 Benchmarking Backward Pass: Large-16E + 16 experts, 4096 tokens, 4096→11008 + Initializing Stride-Optimized CUTLASS strategy... +cute hardware - device_id 0 +cute hardware - driver_version 12080 +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 + - Max active clusters: 33 +kernel params: self.ACC_DTYPE=, use_2cta_instrs=True, mma_tiler_mn=(256, 128), cluster_shape_mn=(2, 2) +max_dynamic_shared_memory: 232448 +max_active_blocks: 1 +❌ Error benchmarking Large-16E: Dimension mismatch: K=11008 vs K_B=4096. A: torch.Size([269, 11008]), B: torch.Size([11008, 4096]), transpose_A=False, transpose_B=True + """ from typing import List, Tuple @@ -172,68 +277,64 @@ def execute_stride_grouped_gemm( ): """ Execute grouped GEMM with stride-based layout control. + CUTLASS always computes A @ B^T. - Args: - operations: List of operation dictionaries with keys: - - 'A': tensor A - - 'B': tensor B - - 'C': output tensor C - - 'transpose_A': bool, whether to logically transpose A - - 'transpose_B': bool, whether to logically transpose B - operation_name: Name for debugging + The transpose flags indicate whether to logically transpose the tensor + before CUTLASS applies its own B^T operation. 
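        Illustrative reference only (not the kernel itself): with both transpose
        flags left at False, each entry in operations is expected to match the
        plain PyTorch computation below, where A is [M, K] and B is [N, K]:

            A, B, C = op["A"], op["B"], op["C"]
            C.copy_(A @ B.t())   # CUTLASS convention: A @ B^T -> [M, N]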
""" if not operations: return device = operations[0]["A"].device - - # Prepare metadata for all operations using stride manipulation problem_sizes = [] strides_abc = [] ptrs_abc = [] for op in operations: - A = op["A"].contiguous() # Ensure contiguous for safe stride manipulation + A = op["A"].contiguous() B = op["B"].contiguous() C = op["C"].contiguous() transpose_A = op.get("transpose_A", False) transpose_B = op.get("transpose_B", False) - # Get logical shapes after transpose + # Apply logical transposes using stride manipulation + # (This mimics what the simple test does with .t().contiguous()) + if transpose_A: - M, K = A.shape[1], A.shape[0] # Logical shape after transpose + # Logically transpose A: [M,K] -> [K,M] + M, K = A.shape[1], A.shape[0] A_strides = self._get_transpose_strides(A) else: - M, K = A.shape + M, K = A.shape[0], A.shape[1] A_strides = list(A.stride()) if transpose_B: - N, K_B = B.shape[0], B.shape[1] # B^T shape - # For CUTLASS B^T operation, we need to handle this specially - # CUTLASS will transpose B, so if we want B^T, we pass original B - B_strides = list(B.stride()) - else: - K_B, N = B.shape - # For CUTLASS B^T operation with no logical transpose - # We need to pass B with swapped strides to get B^T effect + # Logically transpose B: [N,K] -> [K,N] + # After this logical transpose, CUTLASS will do [K,N]^T = [N,K] + K_B, N = B.shape[1], B.shape[0] B_strides = self._get_transpose_strides(B) + else: + # No logical transpose, CUTLASS does [N,K]^T = [K,N] + N, K_B = B.shape[0], B.shape[1] + B_strides = list(B.stride()) - # Validate dimensions - assert K == K_B, f"Inner dimension mismatch: {K} != {K_B}" + # Validate + assert ( + K == K_B + ), f"Dimension mismatch: K={K} vs K_B={K_B}. A: {A.shape}, B: {B.shape}, transpose_A={transpose_A}, transpose_B={transpose_B}" assert C.shape == ( M, N, ), f"Output shape mismatch: expected ({M}, {N}), got {C.shape}" + # Add to batch L = 1 C_strides = list(C.stride()) - - # Add to batch problem_sizes.append([M, N, K, L]) strides_abc.append([A_strides, B_strides, C_strides]) ptrs_abc.append([A.data_ptr(), B.data_ptr(), C.data_ptr()]) - # Execute grouped kernel + # Execute self._execute_kernel( problem_sizes, strides_abc, ptrs_abc, device, operation_name ) @@ -379,23 +480,7 @@ def _execute_forward_stride_optimized( # Convert to CPU for iteration (minimal sync) valid_sizes = m_sizes[valid_indices].cpu().tolist() - valid_offsets = ( - ( - m_offsets[valid_indices] - if len(m_offsets) > len(valid_indices) - else torch.cumsum( - torch.cat( - [ - torch.tensor([0], device=input_tokens.device), - m_sizes[valid_indices][:-1], - ] - ), - dim=0, - ) - ) - .cpu() - .tolist() - ) + valid_offsets = m_offsets[valid_indices].cpu().tolist() valid_indices_cpu = valid_indices.cpu().tolist() for expert_idx, size, offset in zip( @@ -410,15 +495,17 @@ def _execute_forward_stride_optimized( expert_output = output[offset : offset + size] # [M, N] # Forward: expert_input @ expert_weight^T - # A = expert_input [M, K], B = expert_weight [N, K] - # CUTLASS computes A @ B^T = expert_input @ expert_weight^T ✅ + # Mathematical: expert_input[M, K] @ expert_weight^T[K, N] = expert_output[M, N] + # CUTLASS computes: A @ B^T where A = expert_input[M, K], B = expert_weight[N, K] + # CUTLASS does: expert_input[M, K] @ expert_weight[N, K]^T = expert_input[M, K] @ expert_weight^T[K, N] ✓ + # So no logical transpose needed - CUTLASS's built-in transpose gives us what we want operations.append( { "A": expert_input, "B": expert_weight, "C": expert_output, "transpose_A": 
False, # No transpose needed - "transpose_B": True, # CUTLASS will transpose B automatically + "transpose_B": False, # Let CUTLASS do the transpose naturally } ) @@ -437,38 +524,21 @@ def _execute_backward_stride_optimized( grad_weight, strategy, ): - """Execute backward pass with stride optimization""" + """Execute backward pass with stride optimization - COMPLETELY REWRITTEN""" # Convert to CPU for iteration (minimal sync) valid_sizes = m_sizes[valid_indices].cpu().tolist() - valid_offsets = ( - ( - m_offsets[valid_indices] - if len(m_offsets) > len(valid_indices) - else torch.cumsum( - torch.cat( - [ - torch.tensor([0], device=grad_output.device), - m_sizes[valid_indices][:-1], - ] - ), - dim=0, - ) - ) - .cpu() - .tolist() - ) + valid_offsets = m_offsets[valid_indices].cpu().tolist() valid_indices_cpu = valid_indices.cpu().tolist() - # Prepare input gradient operations: dX = dY @ W + # Prepare operations based on the exact same logic as the working simple test input_operations = [] - # Prepare weight gradient operations: dW = dY^T @ X weight_operations = [] for expert_idx, size, offset in zip( valid_indices_cpu, valid_sizes, valid_offsets ): if size > 0: - # Get expert data (all contiguous) + # Get expert data expert_grad_output = grad_output[ offset : offset + size ].contiguous() # [M, N] @@ -479,31 +549,46 @@ def _execute_backward_stride_optimized( expert_grad_input = grad_input[offset : offset + size] # [M, K] expert_grad_weight = grad_weight[expert_idx] # [N, K] - # Input gradient: dX = dY @ W - # We need: grad_output[M,N] @ weight[N,K] = grad_input[M,K] - # CUTLASS: A @ B^T, so we need B^T = weight^T = [K,N] - # Use stride manipulation: tell CUTLASS to interpret weight as [K,N] + # INPUT GRADIENT: dX = dY @ W + # Mathematical: expert_grad_output[M, N] @ expert_weight[N, K] = expert_grad_input[M, K] + # CUTLASS computes: A @ B^T + # We need: expert_grad_output[M, N] @ expert_weight[N, K] + # So: A = expert_grad_output[M, N], B^T = expert_weight[N, K] + # Therefore: B = expert_weight^T[K, N] + # + # In the simple test: weight_for_cutlass = weight.t().contiguous() makes [K, N] + # In stride optimization: we need expert_weight[N, K] to be seen as [K, N] + # So we use transpose_B=True to make CUTLASS transpose it first input_operations.append( { "A": expert_grad_output, # [M, N] - "B": expert_weight, # [N, K] - will be stride-interpreted as [K, N] + "B": expert_weight, # [N, K] -> will be transposed to [K, N] by transpose_B=True "C": expert_grad_input, # [M, K] "transpose_A": False, - "transpose_B": False, # Use stride manipulation instead of transpose + "transpose_B": True, # Transpose expert_weight[N,K] to [K,N], then CUTLASS does [K,N]^T = [N,K] } ) - # Weight gradient: dW = dY^T @ X - # We need: grad_output^T[N,M] @ input[M,K] = grad_weight[N,K] - # CUTLASS: A @ B^T, so A = grad_output^T[N,M], B^T = input^T[K,M] - # Use stride manipulation for both A transpose and B transpose + # WEIGHT GRADIENT: dW = dY^T @ X + # Mathematical: expert_grad_output^T[N, M] @ expert_input[M, K] = expert_grad_weight[N, K] + # CUTLASS computes: A @ B^T + # We need: expert_grad_output^T[N, M] @ expert_input[M, K] + # So: A = expert_grad_output^T[N, M], B^T = expert_input[M, K] + # Therefore: B = expert_input^T[K, M] + # + # In the simple test: + # - grad_output_T = grad_output.t().contiguous() makes [N, M] + # - input_for_cutlass = input_tokens.t().contiguous() makes [K, M] + # In stride optimization: + # - expert_grad_output[M, N] needs to be seen as [N, M] -> transpose_A=True + # - expert_input[M, K] 
needs to be seen as [K, M] -> transpose_B=True weight_operations.append( { - "A": expert_grad_output, # [M, N] - will be stride-interpreted as [N, M] - "B": expert_input, # [M, K] - will be stride-interpreted as [K, M] + "A": expert_grad_output, # [M, N] -> will be transposed to [N, M] by transpose_A=True + "B": expert_input, # [M, K] -> will be transposed to [K, M] by transpose_B=True "C": expert_grad_weight, # [N, K] - "transpose_A": True, # Use stride manipulation for transpose - "transpose_B": False, # CUTLASS handles B^T + "transpose_A": True, # Transpose expert_grad_output[M,N] to [N,M] + "transpose_B": True, # Transpose expert_input[M,K] to [K,M], then CUTLASS does [K,M]^T = [M,K] } ) From f97cbf6ddd59b62bbc9064f9d68793c892bf2cf6 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Fri, 20 Jun 2025 09:47:24 -0700 Subject: [PATCH 21/34] add pytorch_cute_converter --- .../experiments/deepseek_v3/cute_tensor.py | 0 .../improved_b200_grouped_gemm_strat.py | 748 ++++++++++++++++++ .../blackwell/pytorch_cute_converter.py | 512 ++++++++++++ 3 files changed, 1260 insertions(+) create mode 100644 torchtitan/experiments/deepseek_v3/cute_tensor.py create mode 100644 torchtitan/experiments/deepseek_v3/improved_b200_grouped_gemm_strat.py create mode 100644 torchtitan/experiments/kernels/blackwell/pytorch_cute_converter.py diff --git a/torchtitan/experiments/deepseek_v3/cute_tensor.py b/torchtitan/experiments/deepseek_v3/cute_tensor.py new file mode 100644 index 000000000..e69de29bb diff --git a/torchtitan/experiments/deepseek_v3/improved_b200_grouped_gemm_strat.py b/torchtitan/experiments/deepseek_v3/improved_b200_grouped_gemm_strat.py new file mode 100644 index 000000000..3c12f91ab --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/improved_b200_grouped_gemm_strat.py @@ -0,0 +1,748 @@ +""" +"Improved" CUTLASS Group GEMM Strategy using the PyTorch to CUTE converter. + +This version leverages the standalone converter classes to simplify tensor conversion +and metadata preparation, making the code more maintainable and less error-prone. +""" + +from typing import Any, Dict, List, Tuple + +import torch +import torch.nn as nn + +try: + import cuda.bindings.driver as cuda + import cutlass + import cutlass.cute as cute + import cutlass.utils as utils + from cutlass.cute.runtime import from_dlpack + from torchtitan.experiments.kernels.blackwell.cute_grouped_gemm import ( + GroupedGemmKernel, + ) + + HAS_CUTLASS = True +except ImportError as e: + HAS_CUTLASS = False + print(f"❌ CUTLASS import failed: {e}") + + +import logging + +from torchtitan.experiments.kernels.blackwell.pytorch_cute_converter import ( + GroupedGemmTensorManager, + PyTorchToCuteConverter, +) + +logger = logging.getLogger(__name__) + + +class ImprovedCUTLASSGroupedGemmStrategy: + """ + Improved CUTLASS grouped GEMM strategy using converter classes. 
+ + This version provides cleaner code with better separation of concerns: + - Tensor conversion is handled by dedicated converter classes + - Reduced boilerplate and manual tensor manipulation + - Better error handling and validation + - More maintainable codebase + """ + + # Configuration constants + SUPPORTED_CLUSTER_SHAPES = [ + (1, 1), + (1, 2), + (1, 4), + (2, 1), + (2, 2), + (2, 4), + (4, 1), + (4, 2), + (4, 4), + ] + + SINGLE_CTA_M_SIZES = [128, 64] + DUAL_CTA_M_SIZES = [256, 128] + N_SIZE_RANGE = range(32, 257, 32) + + DTYPE_TORCH = torch.bfloat16 + DTYPE_CUTLASS = cutlass.BFloat16 + ACC_DTYPE = cutlass.Float32 + ALIGNMENT = 16 + + def __init__( + self, + custom_activation, + use_2cta_instrs: bool = True, + mma_tiler_mn: Tuple[int, int] = (256, 128), + cluster_shape_mn: Tuple[int, int] = (4, 4), + ): + """ + Initialize the improved CUTLASS grouped GEMM strategy. + + Args: + custom_activation: Activation function (e.g., SiLU) + use_2cta_instrs: Whether to use 2-CTA instructions + mma_tiler_mn: MMA tile sizes (M, N) + cluster_shape_mn: Cluster shape (M, N) + """ + if not HAS_CUTLASS: + raise RuntimeError("CUTLASS not available") + + self.activation_function = custom_activation + self.use_2cta_instrs = use_2cta_instrs + self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() + self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() + + # Initialize converter and tensor manager + self.converter = PyTorchToCuteConverter( + default_alignment=self.ALIGNMENT, default_acc_dtype=self.ACC_DTYPE + ) + self.tensor_manager = GroupedGemmTensorManager( + alignment=self.ALIGNMENT, dtype=self.DTYPE_TORCH + ) + + # Initialize CUTLASS components + self._initialize_kernel() + self._initialize_hardware() + + # Caches + self._compiled_kernels = {} + self._tensormap_buffers = {} + + self._log_initialization() + + def _get_default_mma_tiler(self) -> Tuple[int, int]: + """Get default MMA tiler based on CTA mode.""" + return (256, 128) if self.use_2cta_instrs else (128, 128) + + def _get_default_cluster_shape(self) -> Tuple[int, int]: + """Get default cluster shape based on CTA mode.""" + return (2, 2) if self.use_2cta_instrs else (1, 1) + + def _initialize_kernel(self): + """Initialize the CUTLASS grouped GEMM kernel.""" + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.ACC_DTYPE, + use_2cta_instrs=self.use_2cta_instrs, + mma_tiler_mn=self.mma_tiler_mn, + cluster_shape_mn=self.cluster_shape_mn, + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + def _initialize_hardware(self): + """Initialize hardware information and CUDA stream.""" + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters( + self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + ) + + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) + + def _log_initialization(self): + """Log initialization details.""" + cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + print(f"✅ Improved CUTLASS Strategy initialized:") + print(f" - 2 CTA mode: {self.use_2cta_instrs}") + print(f" - MMA tiler: {self.mma_tiler_mn}") + print(f" - Cluster shape: {self.cluster_shape_mn}") + print(f" - Max active clusters: {self.max_active_clusters}") + + def arrange_expert_weights( + self, all_weights: List[torch.Tensor], submod_name: str, module + ) -> torch.Tensor: + """Store weights in stacked format.""" + return torch.stack(all_weights) + + def execute( + self, + contig_tokens: torch.Tensor, + 
m_sizes: torch.Tensor, + m_offsets: torch.Tensor, + module, + ) -> torch.Tensor: + """ + Execute grouped GEMM operation using improved tensor management. + + Args: + contig_tokens: Input tokens arranged contiguously by expert + m_sizes: Expert sizes tensor + m_offsets: Expert offsets tensor + module: MoE module containing weights + + Returns: + Processed output tokens + """ + # Ensure GPU tensors to avoid CPU-GPU sync + m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( + m_sizes, m_offsets, contig_tokens.device + ) + + # Get weights and validate + weights = self._get_and_validate_weights(module) + device = contig_tokens.device + + # Early exit if no valid experts + if not self._has_valid_experts(m_sizes_gpu): + return torch.zeros( + contig_tokens.shape[0], + weights["gate"].shape[2], + dtype=self.DTYPE_TORCH, + device=device, + ) + + # Execute three-stage MoE computation + return self._execute_moe_computation( + contig_tokens, weights, m_sizes_gpu, m_offsets_gpu, device + ) + + def _ensure_gpu_tensors( + self, m_sizes, m_offsets, device + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Ensure sizes and offsets are GPU tensors.""" + if not isinstance(m_sizes, torch.Tensor): + m_sizes_gpu = torch.tensor(m_sizes, dtype=torch.int32, device=device) + else: + m_sizes_gpu = m_sizes.to(device=device, dtype=torch.int32) + + if not isinstance(m_offsets, torch.Tensor): + m_offsets_gpu = torch.tensor(m_offsets, dtype=torch.int32, device=device) + else: + m_offsets_gpu = m_offsets.to(device=device, dtype=torch.int32) + + return m_sizes_gpu, m_offsets_gpu + + def _get_and_validate_weights(self, module) -> Dict[str, torch.Tensor]: + """Extract and validate weight tensors.""" + required_weights = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] + weights = {} + + for weight_name in required_weights: + if not hasattr(module, weight_name): + raise ValueError(f"Module missing required weight: {weight_name}") + weights[weight_name.split("_")[0]] = module.get_parameter(weight_name) + + return weights + + def _has_valid_experts(self, m_sizes_gpu: torch.Tensor) -> bool: + """Check if any experts have tokens (single sync point).""" + return torch.any(m_sizes_gpu > 0).item() + + def _execute_moe_computation( + self, + contig_tokens: torch.Tensor, + weights: Dict[str, torch.Tensor], + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + device: torch.device, + ) -> torch.Tensor: + """Execute the complete MoE computation pipeline.""" + + # Stage 1: Gate and Up projections + gate_outputs, up_outputs = self._execute_gate_up_projections( + contig_tokens, + weights["gate"], + weights["up"], + m_sizes_gpu, + m_offsets_gpu, + device, + ) + + # Stage 2: Apply activation and combine + hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) + + # Stage 3: Down projection + down_outputs = self._execute_down_projection( + hidden_states, weights["down"], m_sizes_gpu, device + ) + + # Stage 4: Reconstruct output + return self._reconstruct_output( + down_outputs, contig_tokens, m_sizes_gpu, m_offsets_gpu + ) + + def _execute_gate_up_projections( + self, + input_tokens: torch.Tensor, + gate_weights: torch.Tensor, + up_weights: torch.Tensor, + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + device: torch.device, + ) -> Tuple[List, List]: + """Execute gate and up projections using the tensor manager.""" + + # Get valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return [], [] + + # Prepare 
expert operations using tensor manager + gate_ops, up_ops = self._prepare_gate_up_operations( + input_tokens, + gate_weights, + up_weights, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ) + + # Execute grouped GEMMs + if gate_ops["inputs"]: + self._execute_grouped_gemm_operations(gate_ops, device, "gate_up") + + return gate_ops["outputs"], up_ops["outputs"] + + def _prepare_gate_up_operations( + self, + input_tokens: torch.Tensor, + gate_weights: torch.Tensor, + up_weights: torch.Tensor, + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + valid_indices: torch.Tensor, + device: torch.device, + ) -> Tuple[Dict, Dict]: + """Prepare gate and up operations using the tensor manager.""" + + # Convert indices for iteration (minimal sync) + valid_indices_cpu = valid_indices.cpu().tolist() + valid_sizes = m_sizes_gpu[valid_indices].cpu().tolist() + valid_offsets = ( + self._compute_valid_offsets( + m_sizes_gpu, m_offsets_gpu, valid_indices, device + ) + .cpu() + .tolist() + ) + + # Prepare operation lists + gate_ops = {"inputs": [], "weights": [], "outputs": [], "metadata": []} + up_ops = {"inputs": [], "weights": [], "outputs": [], "metadata": []} + + for expert_idx, size, offset in zip( + valid_indices_cpu, valid_sizes, valid_offsets + ): + if size > 0: + # Get expert data + expert_input = input_tokens[offset : offset + size].contiguous() + gate_weight = gate_weights[expert_idx].contiguous() + up_weight = up_weights[expert_idx].contiguous() + + # Create output tensors + M, K = expert_input.shape + N = gate_weight.shape[0] # Assuming [out_features, in_features] + + gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + up_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + + # Use tensor manager to prepare operations + for ops, weight, output in [ + (gate_ops, gate_weight, gate_output), + (up_ops, up_weight, up_output), + ]: + + ( + cute_input, + cute_weight, + cute_output, + problem_size, + strides, + ptrs, + ) = self.tensor_manager.prepare_expert_operation( + expert_input, weight, output, transpose_weight=True + ) + + ops["inputs"].append(expert_input) + ops["weights"].append(weight) + ops["outputs"].append(output) + ops["metadata"].append((problem_size, strides, ptrs)) + + return gate_ops, up_ops + + def _compute_valid_offsets( + self, + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + valid_indices: torch.Tensor, + device: torch.device, + ) -> torch.Tensor: + """Compute valid offsets for expert operations.""" + valid_sizes = m_sizes_gpu[valid_indices] + + if len(m_offsets_gpu) > len(valid_indices): + return m_offsets_gpu[valid_indices] + else: + return torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 + ) + + def _execute_down_projection( + self, + hidden_states: List[torch.Tensor], + down_weights: torch.Tensor, + m_sizes_gpu: torch.Tensor, + device: torch.device, + ) -> List[torch.Tensor]: + """Execute down projection using tensor manager.""" + + if not hidden_states: + return [] + + # Get valid expert indices + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + valid_indices_cpu = valid_indices.cpu().tolist() + + # Prepare down projection operations + down_ops = {"inputs": [], "weights": [], "outputs": [], "metadata": []} + + for i, expert_idx in enumerate(valid_indices_cpu): + if i < len(hidden_states): + hidden = hidden_states[i] + down_weight = down_weights[expert_idx].contiguous() + + M, K = hidden.shape + N = down_weight.shape[0] + 
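                # As in the gate/up path above, down_weight is assumed to be stored
                # [out_features, in_features], so N is the output width and the
                # kernel's A @ B^T convention is relied on to produce
                # hidden[M, K] @ down_weight^T -> [M, N].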
down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + + # Use tensor manager + cute_input, cute_weight, cute_output, problem_size, strides, ptrs = ( + self.tensor_manager.prepare_expert_operation( + hidden, down_weight, down_output, transpose_weight=True + ) + ) + + down_ops["inputs"].append(hidden) + down_ops["weights"].append(down_weight) + down_ops["outputs"].append(down_output) + down_ops["metadata"].append((problem_size, strides, ptrs)) + + # Execute grouped GEMM + if down_ops["inputs"]: + self._execute_grouped_gemm_operations(down_ops, device, "down") + + return down_ops["outputs"] + + def _execute_grouped_gemm_operations( + self, operations: Dict, device: torch.device, stage_name: str + ): + """Execute grouped GEMM operations using converter.""" + + if not operations["metadata"]: + return + + # Extract metadata + all_problem_sizes = [] + all_strides = [] + all_ptrs = [] + + for problem_size, strides, ptrs in operations["metadata"]: + all_problem_sizes.append(problem_size) + all_strides.append(strides) + all_ptrs.append(ptrs) + + # Create CUTE metadata tensors using converter + problem_sizes_cute, strides_cute, ptrs_cute = ( + self.converter.create_metadata_tensors( + all_problem_sizes, all_strides, all_ptrs, device + ) + ) + + # Get other required tensors + num_groups = len(all_problem_sizes) + total_clusters = self._compute_total_clusters(all_problem_sizes) + tensormap_cute = self._get_tensormap_buffer(device) + + # Create initial tensors for compilation using converter + initial_tensors = self.converter.create_initial_compilation_tensors( + tuple(all_problem_sizes[0]), device, self.DTYPE_TORCH + ) + + # Compile and execute kernel + compiled_kernel = self._get_or_compile_kernel( + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ) + + # Execute + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + torch.cuda.synchronize() + + def _get_or_compile_kernel( + self, + num_groups: int, + total_clusters: int, + initial_tensors: List, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ): + """Get compiled kernel from cache or compile new one.""" + + cache_key = ( + num_groups, + total_clusters, + self.use_2cta_instrs, + self.mma_tiler_mn, + self.cluster_shape_mn, + ) + + if cache_key not in self._compiled_kernels: + print(f"Compiling kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}") + + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + *initial_tensors, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + self.stream, + ) + print("✅ Kernel compilation successful") + + return self._compiled_kernels[cache_key] + + def _get_tensormap_buffer(self, device: torch.device): + """Get tensormap buffer using converter.""" + if device not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + self._tensormap_buffers[device] = self.converter.create_tensormap_buffer( + device, sm_count, tensormap_count=3, tensormap_bytes=128 + ) + return self._tensormap_buffers[device] + + def _compute_total_clusters(self, problem_sizes: List[List[int]]) -> int: + """Compute total clusters needed for all problems.""" + cluster_tile_m = self.mma_tiler_mn[0] + cluster_tile_n = self.mma_tiler_mn[1] + + if self.use_2cta_instrs: + cluster_tile_m //= 2 + + cluster_tile_m *= self.cluster_shape_mn[0] + 
cluster_tile_n *= self.cluster_shape_mn[1] + + total = 0 + for M, N, K, L in problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + + return total + + def _apply_activation_and_combine( + self, gate_outputs: List[torch.Tensor], up_outputs: List[torch.Tensor] + ) -> List[torch.Tensor]: + """Apply activation function and combine gate/up outputs.""" + if not gate_outputs or not up_outputs: + return [] + + return [ + self.activation_function(gate_out) * up_out + for gate_out, up_out in zip(gate_outputs, up_outputs) + ] + + def _reconstruct_output( + self, + down_outputs: List[torch.Tensor], + contig_tokens: torch.Tensor, + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + ) -> torch.Tensor: + """Reconstruct the full output tensor.""" + + # Initialize output + output = torch.zeros( + contig_tokens.shape[0], + down_outputs[0].shape[1] if down_outputs else contig_tokens.shape[1], + dtype=self.DTYPE_TORCH, + device=contig_tokens.device, + ) + + if not down_outputs: + return output + + # Get valid expert information + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + valid_sizes = m_sizes_gpu[valid_indices].cpu().tolist() + valid_offsets = ( + self._compute_valid_offsets( + m_sizes_gpu, m_offsets_gpu, valid_indices, contig_tokens.device + ) + .cpu() + .tolist() + ) + + # Copy results back + for i, (size, offset) in enumerate(zip(valid_sizes, valid_offsets)): + if i < len(down_outputs) and size > 0: + output[offset : offset + size] = down_outputs[i] + + return output + + @staticmethod + def is_available() -> bool: + """Check if CUTLASS is available.""" + return HAS_CUTLASS + + +# Factory function for easy creation +def create_improved_cutlass_strategy( + custom_activation, + use_2cta_instrs: bool = True, + mma_tiler_mn: Tuple[int, int] = (256, 128), + cluster_shape_mn: Tuple[int, int] = (4, 4), +) -> ImprovedCUTLASSGroupedGemmStrategy: + """ + Factory function to create improved CUTLASS strategy. 
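+
+    A minimal usage sketch (illustrative values; contig_tokens, m_sizes,
+    m_offsets and moe_module are assumed to come from the surrounding MoE
+    code, and a CUDA context must already exist):
+
+        import torch.nn.functional as F
+
+        strategy = create_improved_cutlass_strategy(
+            custom_activation=F.silu,
+            use_2cta_instrs=True,
+            mma_tiler_mn=(256, 128),
+            cluster_shape_mn=(2, 2),
+        )
+        output = strategy.execute(contig_tokens, m_sizes, m_offsets, moe_module)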
+ + Args: + custom_activation: Activation function + use_2cta_instrs: Use 2-CTA instructions + mma_tiler_mn: MMA tile sizes + cluster_shape_mn: Cluster shape + + Returns: + Configured strategy instance + """ + return ImprovedCUTLASSGroupedGemmStrategy( + custom_activation=custom_activation, + use_2cta_instrs=use_2cta_instrs, + mma_tiler_mn=mma_tiler_mn, + cluster_shape_mn=cluster_shape_mn, + ) + + +# Test function +def test_improved_strategy(): + """Test the improved CUTLASS strategy.""" + if not HAS_CUTLASS: + print("❌ CUTLASS not available for testing") + return False + + print("🧪 Testing Improved CUTLASS Strategy") + print("=" * 50) + + # note - we have to make a pytorch cuda context or this will fail + dummy_tensor = torch.randn(1, 1, device="cuda") + a = dummy_tensor.to("cpu").item() + + try: + import torch.nn.functional as F + + device = torch.device("cuda") + dtype = torch.bfloat16 + + # Test parameters + num_experts = 8 + in_features = 2048 + out_features = 4096 + intermediate_size = 8192 + total_tokens = 1024 + + # Create strategy + strategy = create_improved_cutlass_strategy( + custom_activation=F.silu, + use_2cta_instrs=True, + mma_tiler_mn=(256, 128), + cluster_shape_mn=(2, 2), + ) + + # Create mock module with weights + class MockModule: + def __init__(self): + self.gate_proj_weight = torch.randn( + num_experts, + intermediate_size, + in_features, + dtype=dtype, + device=device, + ) + self.up_proj_weight = torch.randn( + num_experts, + intermediate_size, + in_features, + dtype=dtype, + device=device, + ) + self.down_proj_weight = torch.randn( + num_experts, + out_features, + intermediate_size, + dtype=dtype, + device=device, + ) + + def get_parameter(self, name): + return getattr(self, name) + + module = MockModule() + + # Create test data + contig_tokens = torch.randn( + total_tokens, in_features, dtype=dtype, device=device + ) + expert_assignments = torch.randint( + 0, num_experts, (total_tokens,), device=device + ) + + # Compute expert sizes and offsets + m_sizes = torch.zeros(num_experts, dtype=torch.int32, device=device) + for expert_idx in range(num_experts): + m_sizes[expert_idx] = (expert_assignments == expert_idx).sum() + + m_offsets = torch.cat( + [torch.tensor([0], device=device), torch.cumsum(m_sizes, dim=0)] + ) + + print(f"Expert sizes: {m_sizes.cpu().tolist()}") + + # Execute strategy + print("Executing improved CUTLASS strategy...") + output = strategy.execute(contig_tokens, m_sizes, m_offsets, module) + + print(f"✅ Execution successful!") + print(f" Output shape: {output.shape}") + print(f" Output norm: {output.norm().item():.4f}") + print(f" Output dtype: {output.dtype}") + + # Validate output + assert output.shape == (total_tokens, out_features) + assert output.dtype == dtype + assert torch.isfinite(output).all() + + print("✅ All validations passed!") + return True + + except Exception as e: + print(f"❌ Test failed: {e}") + import traceback + + traceback.print_exc() + return False + + +if __name__ == "__main__": + test_improved_strategy() diff --git a/torchtitan/experiments/kernels/blackwell/pytorch_cute_converter.py b/torchtitan/experiments/kernels/blackwell/pytorch_cute_converter.py new file mode 100644 index 000000000..462a1bc25 --- /dev/null +++ b/torchtitan/experiments/kernels/blackwell/pytorch_cute_converter.py @@ -0,0 +1,512 @@ +""" +Standalone PyTorch to CUTE tensor converter for CUTLASS Group GEMM operations. 
+ +This module provides utilities to convert PyTorch tensors to CUTE tensors +with proper layout, alignment, and data type handling for CUTLASS kernels. +""" + +from typing import List, Optional, Tuple, Union + +import torch + +try: + import cutlass + import cutlass.cute as cute + from cutlass.cute.runtime import from_dlpack + + HAS_CUTLASS = True +except ImportError as e: + HAS_CUTLASS = False + print(f"❌ CUTLASS import failed: {e}") + + +class PyTorchToCuteConverter: + """ + Converter class for PyTorch tensors to CUTE tensors for CUTLASS Group GEMM. + + Handles data type mapping, memory layout, alignment, and stride manipulation + for optimal CUTLASS kernel performance. + """ + + # Data type mappings + DTYPE_MAP = { + torch.float16: cutlass.Float16, + torch.bfloat16: cutlass.BFloat16, + torch.float32: cutlass.Float32, + torch.int8: cutlass.Int8, + torch.int32: cutlass.Int32, + torch.int64: cutlass.Int64, + } + + def __init__(self, default_alignment: int = 16, default_acc_dtype=cutlass.Float32): + """ + Initialize the converter. + + Args: + default_alignment: Memory alignment requirement for CUTE tensors + default_acc_dtype: Default accumulation data type for CUTLASS + """ + if not HAS_CUTLASS: + raise RuntimeError("CUTLASS not available") + + self.default_alignment = default_alignment + self.default_acc_dtype = default_acc_dtype + + def get_cutlass_dtype(self, torch_dtype: torch.dtype): + """Convert PyTorch dtype to CUTLASS dtype.""" + if torch_dtype not in self.DTYPE_MAP: + raise ValueError(f"Unsupported PyTorch dtype: {torch_dtype}") + return self.DTYPE_MAP[torch_dtype] + + def torch_to_cute_tensor( + self, + tensor: torch.Tensor, + alignment: Optional[int] = None, + make_dynamic: bool = True, + dynamic_leading_dim: int = 1, + ) -> "cute.Tensor": + """ + Convert a single PyTorch tensor to CUTE tensor. + + Args: + tensor: Input PyTorch tensor + alignment: Memory alignment (uses default if None) + make_dynamic: Whether to mark layout as dynamic + dynamic_leading_dim: Which dimension to make dynamic + + Returns: + CUTE tensor ready for CUTLASS operations + """ + if not HAS_CUTLASS: + raise RuntimeError("CUTLASS not available") + + # Ensure tensor is contiguous + tensor = tensor.contiguous() + + # Convert to MNKL format (add batch dimension if needed) + if len(tensor.shape) == 2: + mnkl_tensor = tensor.unsqueeze(-1).contiguous() + else: + mnkl_tensor = tensor + + # Get alignment + align = alignment or self.default_alignment + + # Convert to CUTE tensor + cute_tensor = from_dlpack(mnkl_tensor, assumed_align=align) + + # Set element type + cutlass_dtype = self.get_cutlass_dtype(tensor.dtype) + cute_tensor.element_type = cutlass_dtype + + # Make layout dynamic if requested + if make_dynamic: + cute_tensor = cute_tensor.mark_layout_dynamic( + leading_dim=dynamic_leading_dim + ) + + return cute_tensor + + def create_grouped_gemm_tensors( + self, + A_tensors: List[torch.Tensor], + B_tensors: List[torch.Tensor], + C_tensors: List[torch.Tensor], + alignment: Optional[int] = None, + ) -> Tuple[List, List, List]: + """ + Convert lists of PyTorch tensors to CUTE tensors for grouped GEMM. 
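+
+        A minimal sketch (shapes and dtype are illustrative; "converter" is a
+        PyTorchToCuteConverter instance):
+
+            A = [torch.randn(128, 512, dtype=torch.bfloat16, device="cuda")]
+            B = [torch.randn(256, 512, dtype=torch.bfloat16, device="cuda")]
+            C = [torch.zeros(128, 256, dtype=torch.bfloat16, device="cuda")]
+            cute_A, cute_B, cute_C = converter.create_grouped_gemm_tensors(A, B, C)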
+ + Args: + A_tensors: List of A matrices (input tensors) + B_tensors: List of B matrices (weight tensors) + C_tensors: List of C matrices (output tensors) + alignment: Memory alignment + + Returns: + Tuple of (cute_A_tensors, cute_B_tensors, cute_C_tensors) + """ + cute_A = [self.torch_to_cute_tensor(A, alignment) for A in A_tensors] + cute_B = [self.torch_to_cute_tensor(B, alignment) for B in B_tensors] + cute_C = [self.torch_to_cute_tensor(C, alignment) for C in C_tensors] + + return cute_A, cute_B, cute_C + + def create_metadata_tensors( + self, + problem_sizes: List[List[int]], + strides_abc: List[List[List[int]]], + ptrs_abc: List[List[int]], + device: torch.device, + alignment: Optional[int] = None, + ) -> Tuple: + """ + Create CUTE tensors for grouped GEMM metadata. + + Args: + problem_sizes: List of [M, N, K, L] for each problem + strides_abc: List of stride information for A, B, C tensors + ptrs_abc: List of data pointers for A, B, C tensors + device: Target device + alignment: Memory alignment + + Returns: + Tuple of (problem_sizes_cute, strides_cute, ptrs_cute) + """ + align = alignment or self.default_alignment + + # Convert to PyTorch tensors first + problem_sizes_tensor = torch.tensor( + problem_sizes, dtype=torch.int32, device=device + ) + strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) + ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) + + # Convert to CUTE tensors + problem_sizes_cute = from_dlpack(problem_sizes_tensor, assumed_align=align) + strides_cute = from_dlpack(strides_tensor, assumed_align=align) + ptrs_cute = from_dlpack(ptrs_tensor, assumed_align=align) + + return problem_sizes_cute, strides_cute, ptrs_cute + + def create_initial_compilation_tensors( + self, + problem_shape: Tuple[int, int, int, int], + device: torch.device, + dtype: torch.dtype = torch.bfloat16, + alignment: Optional[int] = None, + ) -> List: + """ + Create initial tensors needed for CUTLASS kernel compilation. + + Args: + problem_shape: (M, N, K, L) shape tuple + device: Target device + dtype: PyTorch data type + alignment: Memory alignment + + Returns: + List of CUTE tensors for kernel compilation + """ + M, N, K, L = problem_shape + align = alignment or self.default_alignment + + # Create PyTorch tensors + tensors = [ + torch.randn(M, K, dtype=dtype, device=device), # A + torch.randn(N, K, dtype=dtype, device=device), # B + torch.zeros(M, N, dtype=dtype, device=device), # C + ] + + # Convert to CUTE tensors + cute_tensors = [] + for tensor in tensors: + mnkl_tensor = tensor.unsqueeze(-1).contiguous() + cute_tensor = from_dlpack(mnkl_tensor, assumed_align=align) + cute_tensor.element_type = self.get_cutlass_dtype(dtype) + cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) + cute_tensors.append(cute_tensor) + + return cute_tensors + + def create_tensormap_buffer( + self, + device: torch.device, + sm_count: int, + tensormap_count: int = 3, + tensormap_bytes: int = 128, + alignment: Optional[int] = None, + ): + """ + Create tensormap buffer for CUTLASS kernel. 
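+
+        A minimal sketch (the SM count is illustrative; in practice it comes
+        from cutlass.utils.HardwareInfo):
+
+            buf = converter.create_tensormap_buffer(torch.device("cuda"), sm_count=148)
+            # CUTE view of a (148, 3, 16) int64 buffer: 3 descriptors x 128 bytes per SM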
+ + Args: + device: Target device + sm_count: Number of streaming multiprocessors + tensormap_count: Number of tensormap entries + tensormap_bytes: Bytes per tensormap entry + alignment: Memory alignment + + Returns: + CUTE tensor for tensormap buffer + """ + align = alignment or self.default_alignment + + tensormap_tensor = torch.zeros( + (sm_count, tensormap_count, tensormap_bytes // 8), + dtype=torch.int64, + device=device, + ) + + return from_dlpack(tensormap_tensor, assumed_align=align) + + +class GroupedGemmTensorManager: + """ + High-level manager for grouped GEMM tensor operations. + + Provides simplified interface for common grouped GEMM tensor conversion patterns. + """ + + def __init__(self, alignment: int = 16, dtype: torch.dtype = torch.bfloat16): + """ + Initialize the tensor manager. + + Args: + alignment: Memory alignment for CUTE tensors + dtype: Default PyTorch data type + """ + self.converter = PyTorchToCuteConverter(default_alignment=alignment) + self.dtype = dtype + self.alignment = alignment + + def prepare_expert_operation( + self, + input_tokens: torch.Tensor, + expert_weights: torch.Tensor, + output_tensor: torch.Tensor, + transpose_weight: bool = True, + ) -> Tuple: + """ + Prepare tensors for a single expert operation (e.g., one GEMM in grouped GEMM). + + Args: + input_tokens: Input tensor [M, K] + expert_weights: Weight tensor [N, K] or [K, N] + output_tensor: Output tensor [M, N] + transpose_weight: Whether weight needs transposition + + Returns: + Tuple of (cute_input, cute_weight, cute_output, problem_size, strides, ptrs) + """ + # Ensure tensors are contiguous + input_tokens = input_tokens.contiguous() + expert_weights = expert_weights.contiguous() + output_tensor = output_tensor.contiguous() + + # Handle weight transposition + if transpose_weight and len(expert_weights.shape) == 2: + # For CUTLASS, we often need weights in specific layout + # This handles the common case where PyTorch weights are [out_features, in_features] + # but CUTLASS expects [in_features, out_features] or specific stride pattern + print( + f"Warning: weight transposition not supported...recommend using strides for this case" + ) + pass # Keep original - handle via strides in CUTLASS + + # Convert to CUTE tensors + cute_input = self.converter.torch_to_cute_tensor(input_tokens) + cute_weight = self.converter.torch_to_cute_tensor(expert_weights) + cute_output = self.converter.torch_to_cute_tensor(output_tensor) + + # Prepare metadata + M, K = input_tokens.shape + if transpose_weight: + N = expert_weights.shape[0] # Weight is [N, K] + else: + N = expert_weights.shape[1] # Weight is [K, N] + L = 1 + + problem_size = [M, N, K, L] + + # Get strides (handle MNKL format) + input_mnkl = input_tokens.unsqueeze(-1) + weight_mnkl = expert_weights.unsqueeze(-1) + output_mnkl = output_tensor.unsqueeze(-1) + + strides = [ + list(input_mnkl.stride()[:2]), + list(weight_mnkl.stride()[:2]), + list(output_mnkl.stride()[:2]), + ] + + ptrs = [ + input_tokens.data_ptr(), + expert_weights.data_ptr(), + output_tensor.data_ptr(), + ] + + return cute_input, cute_weight, cute_output, problem_size, strides, ptrs + + def prepare_grouped_operation( + self, + input_list: List[torch.Tensor], + weight_list: List[torch.Tensor], + output_list: List[torch.Tensor], + device: torch.device, + transpose_weights: bool = True, + ) -> Tuple: + """ + Prepare tensors for grouped GEMM operation. 
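+
+        A minimal sketch (two experts with illustrative shapes; "manager" is a
+        GroupedGemmTensorManager instance):
+
+            inputs = [torch.randn(128, 512, dtype=torch.bfloat16, device="cuda"),
+                      torch.randn(64, 512, dtype=torch.bfloat16, device="cuda")]
+            weights = [torch.randn(256, 512, dtype=torch.bfloat16, device="cuda")] * 2
+            outputs = [torch.zeros(128, 256, dtype=torch.bfloat16, device="cuda"),
+                       torch.zeros(64, 256, dtype=torch.bfloat16, device="cuda")]
+            init_tensors, sizes, strides, ptrs = manager.prepare_grouped_operation(
+                inputs, weights, outputs, torch.device("cuda")
+            )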
+ + Args: + input_list: List of input tensors + weight_list: List of weight tensors + output_list: List of output tensors + device: Target device + transpose_weights: Whether weights need transposition + + Returns: + Tuple of (initial_tensors, problem_sizes_cute, strides_cute, ptrs_cute) + """ + if not (len(input_list) == len(weight_list) == len(output_list)): + raise ValueError("All lists must have the same length") + + # Collect metadata for all operations + all_problem_sizes = [] + all_strides = [] + all_ptrs = [] + + for inp, weight, out in zip(input_list, weight_list, output_list): + _, _, _, problem_size, strides, ptrs = self.prepare_expert_operation( + inp, weight, out, transpose_weights + ) + all_problem_sizes.append(problem_size) + all_strides.append(strides) + all_ptrs.append(ptrs) + + # Create metadata tensors + problem_sizes_cute, strides_cute, ptrs_cute = ( + self.converter.create_metadata_tensors( + all_problem_sizes, all_strides, all_ptrs, device + ) + ) + + # Create initial tensors for compilation (use first problem size as template) + initial_tensors = self.converter.create_initial_compilation_tensors( + tuple(all_problem_sizes[0]), device, self.dtype + ) + + return initial_tensors, problem_sizes_cute, strides_cute, ptrs_cute + + +# Convenience functions for common use cases +def pytorch_to_cute_tensor(tensor: torch.Tensor, alignment: int = 16) -> "cute.Tensor": + """ + Simple conversion function for single PyTorch tensor to CUTE tensor. + + Args: + tensor: PyTorch tensor to convert + alignment: Memory alignment requirement + + Returns: + CUTE tensor + """ + converter = PyTorchToCuteConverter(alignment) + return converter.torch_to_cute_tensor(tensor) + + +def prepare_moe_expert_batch( + input_tokens: torch.Tensor, + expert_weights: torch.Tensor, + m_sizes: torch.Tensor, + m_offsets: torch.Tensor, + transpose_weights: bool = True, +) -> Tuple: + """ + Prepare batch of expert operations for MoE grouped GEMM. 
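+
+    A minimal sketch (token counts are illustrative; "tokens" and
+    "expert_weights" follow the shapes documented below):
+
+        m_sizes = torch.tensor([512, 512], dtype=torch.int32, device="cuda")
+        m_offsets = torch.tensor([0, 512, 1024], dtype=torch.int32, device="cuda")
+        init_tensors, sizes_cute, strides_cute, ptrs_cute = prepare_moe_expert_batch(
+            tokens, expert_weights, m_sizes, m_offsets
+        )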
+ + Args: + input_tokens: All input tokens [total_tokens, in_features] + expert_weights: Stacked expert weights [num_experts, out_features, in_features] + m_sizes: Number of tokens per expert [num_experts] + m_offsets: Token offsets per expert [num_experts + 1] + transpose_weights: Whether to transpose weights + + Returns: + Prepared tensors and metadata for grouped GEMM + """ + manager = GroupedGemmTensorManager() + device = input_tokens.device + + # Prepare individual expert operations + input_list = [] + weight_list = [] + output_list = [] + + # Convert sizes and offsets to CPU for iteration + sizes_cpu = m_sizes.cpu().tolist() + offsets_cpu = m_offsets.cpu().tolist() + + for expert_idx, size in enumerate(sizes_cpu): + if size > 0: + offset = offsets_cpu[expert_idx] + + # Get expert data + expert_input = input_tokens[offset : offset + size].contiguous() + expert_weight = expert_weights[expert_idx].contiguous() + + # Create output tensor + M, K = expert_input.shape + N = expert_weight.shape[0] if transpose_weights else expert_weight.shape[1] + expert_output = torch.empty(M, N, dtype=input_tokens.dtype, device=device) + + input_list.append(expert_input) + weight_list.append(expert_weight) + output_list.append(expert_output) + + return manager.prepare_grouped_operation( + input_list, weight_list, output_list, device, transpose_weights + ) + + +# Example usage and testing +def test_converter(): + """Test the PyTorch to CUTE tensor converter.""" + if not HAS_CUTLASS: + print("❌ CUTLASS not available for testing") + return False + + print("🧪 Testing PyTorch to CUTE Tensor Converter") + print("=" * 50) + + try: + device = torch.device("cuda") + dtype = torch.bfloat16 + + # Create test data + M, N, K = 128, 256, 512 + input_tensor = torch.randn(M, K, dtype=dtype, device=device) + weight_tensor = torch.randn(N, K, dtype=dtype, device=device) + output_tensor = torch.zeros(M, N, dtype=dtype, device=device) + + # Test single tensor conversion + converter = PyTorchToCuteConverter() + cute_input = converter.torch_to_cute_tensor(input_tensor) + print(f"✅ Single tensor conversion successful") + print(f" Input shape: {input_tensor.shape} -> CUTE tensor created") + + # Test tensor manager + manager = GroupedGemmTensorManager() + cute_inp, cute_weight, cute_out, problem_size, strides, ptrs = ( + manager.prepare_expert_operation(input_tensor, weight_tensor, output_tensor) + ) + print(f"✅ Expert operation preparation successful") + print(f" Problem size: {problem_size}") + + # Test grouped operation preparation + input_list = [input_tensor, input_tensor[:64]] + weight_list = [weight_tensor, weight_tensor] + output_list = [output_tensor, output_tensor[:64]] + + initial_tensors, prob_cute, strides_cute, ptrs_cute = ( + manager.prepare_grouped_operation( + input_list, weight_list, output_list, device + ) + ) + print(f"✅ Grouped operation preparation successful") + print(f" Number of operations: {len(input_list)}") + + print("\n✅ All tests passed!") + return True + + except Exception as e: + print(f"❌ Test failed: {e}") + import traceback + + traceback.print_exc() + return False + + +if __name__ == "__main__": + test_converter() From 5a8cb9c11f9a3c125424bd1147108bade18b5a2f Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Fri, 20 Jun 2025 09:49:26 -0700 Subject: [PATCH 22/34] remove transpose warning - we handle via strides --- .../experiments/kernels/blackwell/pytorch_cute_converter.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git 
a/torchtitan/experiments/kernels/blackwell/pytorch_cute_converter.py b/torchtitan/experiments/kernels/blackwell/pytorch_cute_converter.py index 462a1bc25..e16752e31 100644 --- a/torchtitan/experiments/kernels/blackwell/pytorch_cute_converter.py +++ b/torchtitan/experiments/kernels/blackwell/pytorch_cute_converter.py @@ -289,9 +289,7 @@ def prepare_expert_operation( # For CUTLASS, we often need weights in specific layout # This handles the common case where PyTorch weights are [out_features, in_features] # but CUTLASS expects [in_features, out_features] or specific stride pattern - print( - f"Warning: weight transposition not supported...recommend using strides for this case" - ) + pass # Keep original - handle via strides in CUTLASS # Convert to CUTE tensors From d4d6314be9b094c9e68fd7505edc862c7752ec3f Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Fri, 20 Jun 2025 22:40:20 -0700 Subject: [PATCH 23/34] ds inference all working again, blackwell group gemm and manual looping --- torchtitan/experiments/deepseek_v3/generate.py | 6 +++--- torchtitan/experiments/deepseek_v3/model.py | 11 ++++++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/generate.py b/torchtitan/experiments/deepseek_v3/generate.py index 67b551a2f..4c26052b3 100644 --- a/torchtitan/experiments/deepseek_v3/generate.py +++ b/torchtitan/experiments/deepseek_v3/generate.py @@ -19,9 +19,9 @@ from model_config import deepseek_config_registry from torch.distributed.device_mesh import DeviceMesh from torch.distributed.pipelining import PipelineStage, ScheduleGPipe -from transformers import AutoTokenizer from torchtitan.tools.utils import Color +from transformers import AutoTokenizer # Uncomment the model you want to run. model_id, mesh_shape = "deepseek-ai/DeepSeek-V2-Lite-Chat", (1, 4) @@ -127,7 +127,7 @@ def create_model(dist_config: DistConfig): model_args.ep_size = dist_config.ep_size model_args.num_stages = dist_config.pp_size model_args.stage_idx = dist_config.pp_rank - model_args.max_seq_len = 4096 # 16384 + model_args.max_seq_len = 1024 # 16384 with dist_config.device, dist_config.mesh: model = DeepseekForCausalLM(model_args) @@ -375,7 +375,7 @@ def generate_with_cuda_graph( ] generate(model, pp_schedule, tokenizer, dist_config, messages) - generate_with_cuda_graph(model, tokenizer, dist_config, messages) + # generate_with_cuda_graph(model, tokenizer, dist_config, messages) if rank == 0: print(f"\n{color.yellow}Closing inference mesh...{color.reset}") diff --git a/torchtitan/experiments/deepseek_v3/model.py b/torchtitan/experiments/deepseek_v3/model.py index 131b1ea2b..82c9e4dc7 100644 --- a/torchtitan/experiments/deepseek_v3/model.py +++ b/torchtitan/experiments/deepseek_v3/model.py @@ -45,7 +45,9 @@ from attn_mask_utils import _prepare_4d_causal_attention_mask from group_gemms import ( + CUTLASSGroupedGemmStrategy, DSGroupGEMM, + ManualLoopGroupGEMM, TorchAOBF16GroupGEMM, TorchBF16GroupGEMM, TorchFP8GroupGEMM, @@ -474,7 +476,7 @@ class MoE(nn.Module): # Group GEMM strategies group_gemm_strategies = None # which group gemm to use? 
- group_mm = "torch" # fp8 options = ["torchfp8", "dsgemm"] bf16 = ["torch", , "torchao", "tritoncg"] + group_mm = "cutlass" # fp8 options = ["torchfp8", "dsgemm"] bf16 = ["manual", "torch", , "torchao", "tritoncg"] def __init__(self, config): super().__init__() @@ -550,6 +552,12 @@ def _initialize_group_gemm_strategies(cls): if TritonCGBF16GroupGEMM.is_available() else None ), + "manual": ManualLoopGroupGEMM(MLP.act_fn), + "cutlass": ( + CUTLASSGroupedGemmStrategy(MLP.act_fn) + if CUTLASSGroupedGemmStrategy.is_available() + else None + ), } def combine_experts(self, submod_name: str): @@ -856,6 +864,7 @@ def moe_on_device(self, x, topk_ids, topk_weight): # Prepare buffer for tokens processed by experts processed_tokens = self.get_gather_buf() + # processed_tokens.to("cuda") # Move into Symmetric Memory for the return shuffle processed_tokens[permuted_indices] = hidden_outputs From f2a146a215ce03a0594a0d50cf0f032becfde013 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sat, 21 Jun 2025 20:30:38 -0700 Subject: [PATCH 24/34] standalone version for cutlass gg --- ...ped_gemm_strat.py => cute_interface_gg.py} | 20 +- .../deepseek_v3/cutlass_grouped_gemm.py | 830 ++++++++++++++++++ .../experiments/deepseek_v3/generate.py | 3 + .../experiments/deepseek_v3/group_gemms.py | 670 +++++++++++++- torchtitan/experiments/deepseek_v3/model.py | 17 +- 5 files changed, 1531 insertions(+), 9 deletions(-) rename torchtitan/experiments/deepseek_v3/{improved_b200_grouped_gemm_strat.py => cute_interface_gg.py} (97%) create mode 100644 torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py diff --git a/torchtitan/experiments/deepseek_v3/improved_b200_grouped_gemm_strat.py b/torchtitan/experiments/deepseek_v3/cute_interface_gg.py similarity index 97% rename from torchtitan/experiments/deepseek_v3/improved_b200_grouped_gemm_strat.py rename to torchtitan/experiments/deepseek_v3/cute_interface_gg.py index 3c12f91ab..82a79be45 100644 --- a/torchtitan/experiments/deepseek_v3/improved_b200_grouped_gemm_strat.py +++ b/torchtitan/experiments/deepseek_v3/cute_interface_gg.py @@ -131,6 +131,7 @@ def _initialize_kernel(self): def _initialize_hardware(self): """Initialize hardware information and CUDA stream.""" + # TODO - if we do not have a cuda context, this will fail... self.hardware_info = utils.HardwareInfo() self.max_active_clusters = self.hardware_info.get_max_active_clusters( self.cluster_shape_mn[0] * self.cluster_shape_mn[1] @@ -152,6 +153,7 @@ def arrange_expert_weights( self, all_weights: List[torch.Tensor], submod_name: str, module ) -> torch.Tensor: """Store weights in stacked format.""" + # TODO - let's pre-transsose... 
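+        # A possible pre-transpose sketch (not enabled yet; assumes weights are
+        # stored as [out_dim, in_dim] and the kernel consumes [in_dim, out_dim]):
+        #   transposed = [w.t().contiguous() for w in all_weights]
+        #   return torch.stack(transposed)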
return torch.stack(all_weights) def execute( @@ -237,26 +239,34 @@ def _execute_moe_computation( device: torch.device, ) -> torch.Tensor: """Execute the complete MoE computation pipeline.""" + print(f"⚙️ Executing MoE computation on {device}") + print(f"Stage 1: Gate and Up projections") + if m_sizes_gpu.requires_grad: + m_sizes_gpu = m_sizes_gpu.detach() + m_offsets_gpu = m_offsets_gpu.detach() # Stage 1: Gate and Up projections gate_outputs, up_outputs = self._execute_gate_up_projections( contig_tokens, - weights["gate"], - weights["up"], + weights["gate"].detach(), + weights["up"].detach(), m_sizes_gpu, m_offsets_gpu, device, ) + print(f"Stage 2: Apply activation and combine") # Stage 2: Apply activation and combine hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) # Stage 3: Down projection + print(f"Stage 3: Down projection") down_outputs = self._execute_down_projection( - hidden_states, weights["down"], m_sizes_gpu, device + hidden_states, weights["down"].detach(), m_sizes_gpu, device ) # Stage 4: Reconstruct output + print(f"Stage 4: Reconstruct output") return self._reconstruct_output( down_outputs, contig_tokens, m_sizes_gpu, m_offsets_gpu ) @@ -462,7 +472,7 @@ def _execute_grouped_gemm_operations( tuple(all_problem_sizes[0]), device, self.DTYPE_TORCH ) - # Compile and execute kernel + # Get or Compile kernel compiled_kernel = self._get_or_compile_kernel( num_groups, total_clusters, @@ -642,7 +652,7 @@ def test_improved_strategy(): print("❌ CUTLASS not available for testing") return False - print("🧪 Testing Improved CUTLASS Strategy") + print("Testing Improved CUTLASS Strategy") print("=" * 50) # note - we have to make a pytorch cuda context or this will fail diff --git a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py new file mode 100644 index 000000000..cdfc2673a --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py @@ -0,0 +1,830 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +CUTLASS GroupedGEMM Strategy for Blackwell architecture. + +This module contains the CUTLASSGroupedGemmStrategy implementation that uses +CUTLASS GroupedGemmKernel for high-performance group GEMM operations with +pre-transposed weights and optional input validation. +""" + +""" +current error: +The expanded size of the tensor (1408) must match the existing size (2048) at non-singleton dimension 1. Target sizes: [12288, 1408]. Tensor sizes: [12288, 2048] + +""" + +import logging + +import torch + +try: + import cuda.bindings.driver as cuda + import cutlass + import cutlass.cute as cute + import cutlass.torch as cutlass_torch + import cutlass.utils as utils + from cutlass.cute.runtime import from_dlpack + from torchtitan.experiments.kernels.blackwell.cute_grouped_gemm import ( + GroupedGemmKernel, + ) + + HAS_CUTLASS = True + print("✓ CUTLASS and strategies imported successfully") +except ImportError as e: + HAS_CUTLASS = False + print(f"✗ CUTLASS import failed: {e}") + print("CUTLASSGroupedGemmStrategy will not be available") + +# Import base class +from .group_gemms import GroupGEMMStrategy + +logger = logging.getLogger(__name__) + + +class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): + """ + Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. 
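+
+    A minimal construction sketch (illustrative values; the module passed to
+    execute() is expected to expose gate/up/down projection weights arranged by
+    arrange_expert_weights):
+
+        strategy = CUTLASSGroupedGemmStrategy(
+            custom_activation=torch.nn.functional.silu,
+            use_2cta_instrs=True,
+            mma_tiler_mn=(256, 128),
+            cluster_shape_mn=(2, 2),
+        )
+        out = strategy.execute(contig_tokens, m_sizes, m_offsets, moe_module)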
+ + This version is optimized for minimal CPU-GPU synchronization with the following features: + - Pre-transposed weights during arrangement (eliminates runtime transpose overhead) + - GPU-first processing with batched CPU transfers when unavoidable + - Deferred synchronization until absolutely necessary for control flow + - Filtered operations on GPU before any CPU transfer + - Optional input validation that can be disabled for production performance + + Sync Optimization Strategy: + - Keep all computations on GPU as long as possible + - Batch multiple CPU transfers into single operations + - Filter invalid/empty data on GPU before CPU transfer + - Use GPU tensor operations instead of CPU iteration where possible + - Only sync when control flow decisions are absolutely required + """ + + # Constants for Blackwell architecture support + SUPPORTED_CLUSTER_SHAPES = [ + (1, 1), + (1, 2), + (1, 4), + (2, 1), + (2, 2), + (2, 4), + (4, 1), + (4, 2), + (4, 4), + ] + + SINGLE_CTA_M_SIZES = [128, 64] + DUAL_CTA_M_SIZES = [256, 128] + N_SIZE_RANGE = range(32, 257, 32) # 32 - 256, step 32 + + DTYPE_TORCH = torch.bfloat16 + DTYPE_CUTLASS = cutlass.BFloat16 + ACC_DTYPE = cutlass.Float32 + ALIGNMENT = 16 + TENSORMAP_COUNT = 3 + TENSORMAP_BYTES = 128 + + def __init__( + self, + custom_activation, + use_2cta_instrs=True, + mma_tiler_mn=(256, 128), + cluster_shape_mn=(4, 4), + validate=True, + debug_shapes=True, + ): + """ + Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture. + + Args: + custom_activation: Activation function to use (e.g., SiLU) + use_2cta_instrs: Whether to use 2 CTA instructions for better performance + mma_tiler_mn: MMA tiler configuration (M, N) + cluster_shape_mn: Cluster shape configuration (M, N) + validate: Whether to validate inputs (disable for performance in production) + debug_shapes: Whether to log tensor shapes for debugging dimension mismatches + """ + super().__init__(custom_activation) + self.use_2cta_instrs = use_2cta_instrs + self.validate = validate + self.debug_shapes = True # debug_shapes + + # Set configuration + self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() + self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() + + # Validate configurations only if validation is enabled + if self.validate: + self._validate_configurations() + + # Initialize kernel and hardware info + self._initialize_kernel() + self._initialize_hardware() + + # Initialize caches + self._compiled_kernels = {} + self._tensormap_buffers = {} + + self._log_initialization() + + def _get_default_mma_tiler(self): + """Get default MMA tiler configuration based on CTA mode.""" + return (256, 128) if self.use_2cta_instrs else (128, 128) + + def _get_default_cluster_shape(self): + """Get default cluster shape based on CTA mode.""" + return (2, 2) if self.use_2cta_instrs else (1, 1) + + def _initialize_kernel(self): + """Initialize the CUTLASS grouped GEMM kernel.""" + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.ACC_DTYPE, + use_2cta_instrs=self.use_2cta_instrs, + mma_tiler_mn=self.mma_tiler_mn, + cluster_shape_mn=self.cluster_shape_mn, + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + def _initialize_hardware(self): + """Initialize hardware information and stream.""" + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters( + self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + ) + + torch_stream = torch.cuda.current_stream() + self.stream = 
cuda.CUstream(torch_stream.cuda_stream) + + def _validate_configurations(self): + """Validate configurations for Blackwell.""" + self._validate_mma_tiler() + self._validate_cluster_shape() + self._validate_2cta_constraints() + + def _validate_mma_tiler(self): + """Validate MMA tiler configuration.""" + m_size, n_size = self.mma_tiler_mn + + valid_m_sizes = ( + self.DUAL_CTA_M_SIZES if self.use_2cta_instrs else self.SINGLE_CTA_M_SIZES + ) + mode_name = "2 CTA" if self.use_2cta_instrs else "single CTA" + + if m_size not in valid_m_sizes: + raise ValueError( + f"For {mode_name} mode on Blackwell, MMA tiler M must be in {valid_m_sizes}, got {m_size}" + ) + + if n_size not in self.N_SIZE_RANGE: + raise ValueError( + f"MMA tiler N must be in range [32, 256] with step 32, got {n_size}" + ) + + def _validate_cluster_shape(self): + """Validate cluster shape configuration.""" + if self.cluster_shape_mn not in self.SUPPORTED_CLUSTER_SHAPES: + raise ValueError( + f"Cluster shape {self.cluster_shape_mn} not supported on Blackwell. " + f"Valid cluster shapes are: {self.SUPPORTED_CLUSTER_SHAPES}" + ) + + def _validate_2cta_constraints(self): + """Validate 2 CTA specific constraints.""" + if self.use_2cta_instrs and self.cluster_shape_mn[0] % 2 != 0: + valid_2cta_shapes = [ + shape for shape in self.SUPPORTED_CLUSTER_SHAPES if shape[0] % 2 == 0 + ] + raise ValueError( + f"For 2 CTA mode, cluster shape M must be even, got {self.cluster_shape_mn[0]}. " + f"Valid 2 CTA cluster shapes: {valid_2cta_shapes}" + ) + + def _log_initialization(self): + """Log initialization information.""" + cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + logger.info(f"Initialized CUTLASSGroupedGemmStrategy for Blackwell with:") + logger.info(f" - 2 CTA instructions: {self.use_2cta_instrs}") + logger.info(f" - MMA tiler (M, N): {self.mma_tiler_mn}") + logger.info(f" - Cluster shape (M, N): {self.cluster_shape_mn}") + logger.info(f" - Cluster size: {cluster_size}") + logger.info(f" - Pre-transposed weights: Enabled") + logger.info( + f" - Input validation: {'Enabled' if self.validate else 'Disabled'}" + ) + logger.info(f" - CPU-GPU sync optimization: Enabled") + logger.info( + f" - Debug shapes: {'Enabled' if self.debug_shapes else 'Disabled'}" + ) + if cluster_size > 1: + logger.info(f" - Using multi-CTA parallelism") + + def _debug_log_shapes(self, message, **tensors): + """Log tensor shapes for debugging if debug_shapes is enabled""" + if self.debug_shapes: + shape_info = [] + for name, tensor in tensors.items(): + if hasattr(tensor, "shape"): + shape_info.append(f"{name}: {tensor.shape}") + else: + shape_info.append(f"{name}: {type(tensor)}") + logger.info(f"[SHAPE DEBUG] {message} - {', '.join(shape_info)}") + + def arrange_expert_weights(self, all_weights, submod_name, module): + """Store weights in pre-transposed stacked format for optimal CUTLASS performance.""" + # Pre-transpose weights from [out_dim, in_dim] to [in_dim, out_dim] + # This eliminates the need for transpose operations during execution + + # Note: Different projections have different original shapes: + # - gate/up: [intermediate_size, hidden_size] -> transpose to [hidden_size, intermediate_size] + # - down: [hidden_size, intermediate_size] -> transpose to [intermediate_size, hidden_size] + # for w in all_weights: + # print(f"weight shape: {w.shape}, ") + # transposed_weights = [w.t().contiguous() for w in all_weights] + return torch.stack(all_weights) + + def execute(self, contig_tokens, m_sizes, m_offsets, module): + """ + Execute using 
CUTLASS grouped GEMM kernel with pre-transposed weights. + + Args: + contig_tokens: Input tokens arranged contiguously by expert + m_sizes: Tensor of expert sizes (GPU tensor to avoid sync) + m_offsets: Tensor of expert offsets (GPU tensor to avoid sync) + module: MoE module containing pre-transposed weights + """ + # Convert to GPU tensors if needed (avoid CPU-GPU sync) + m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( + m_sizes, m_offsets, contig_tokens.device + ) + + # Validate inputs only if validation is enabled + if self.validate: + self._validate_inputs(contig_tokens, m_sizes_gpu, module) + + # Get pre-transposed weights and device + weights = self._get_weights(module) + device = contig_tokens.device + + # Debug logging + self._debug_log_shapes( + "Input tensors", + contig_tokens=contig_tokens, + gate_weights=weights["gate"], + up_weights=weights["up"], + down_weights=weights["down"], + ) + + # Prepare output tensor + output = torch.zeros( + contig_tokens.shape[0], + weights["gate"].shape[2], # output dimension is now last dimension + dtype=self.DTYPE_TORCH, + device=device, + ) + + # Check for valid experts using GPU operations (defer sync) + has_valid_experts = self._has_valid_experts_gpu(m_sizes_gpu) + + # Early exit if no valid experts (minimal sync only when needed) + if not has_valid_experts.item(): + return output + + # Execute the three-stage computation using GPU-only operations with pre-transposed weights + gate_outputs, up_outputs = self._execute_projections_gpu( + contig_tokens, + weights["gate"], + weights["up"], + m_sizes_gpu, + m_offsets_gpu, + device, + ) + + hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) + + final_outputs = self._execute_down_projection_gpu( + hidden_states, weights["down"], m_sizes_gpu, device + ) + + return self._reconstruct_output_gpu( + final_outputs, m_sizes_gpu, m_offsets_gpu, output + ) + + def _ensure_gpu_tensors(self, m_sizes, m_offsets, device): + """Ensure sizes and offsets are GPU tensors with minimal CPU-GPU sync""" + # Convert m_sizes + if not isinstance(m_sizes, torch.Tensor): + m_sizes_gpu = torch.tensor(m_sizes, dtype=torch.int32, device=device) + else: + # Only move if not already on correct device (avoids unnecessary transfer) + if m_sizes.device != device or m_sizes.dtype != torch.int32: + m_sizes_gpu = m_sizes.to(device=device, dtype=torch.int32) + else: + m_sizes_gpu = m_sizes + + # Convert m_offsets + if not isinstance(m_offsets, torch.Tensor): + m_offsets_gpu = torch.tensor(m_offsets, dtype=torch.int32, device=device) + else: + # Only move if not already on correct device (avoids unnecessary transfer) + if m_offsets.device != device or m_offsets.dtype != torch.int32: + m_offsets_gpu = m_offsets.to(device=device, dtype=torch.int32) + else: + m_offsets_gpu = m_offsets + + return m_sizes_gpu, m_offsets_gpu + + def _has_valid_experts_gpu(self, m_sizes_gpu): + """Check if any experts have tokens using GPU operations (no sync).""" + # Return the tensor itself - let caller decide when to sync + return torch.any(m_sizes_gpu > 0) + + def _validate_inputs(self, contig_tokens, m_sizes_gpu, module): + """Validate input parameters with minimal GPU sync""" + # Check dtype without sync (comparison is done on device info) + if contig_tokens.dtype != self.DTYPE_TORCH: + raise ValueError( + f"Expected input dtype {self.DTYPE_TORCH}, got {contig_tokens.dtype}" + ) + + # Check tensor dimensionality (no sync needed) + if len(contig_tokens.shape) != 2: + raise ValueError( + f"Expected 2D input tensor, got shape 
{contig_tokens.shape}" + ) + + # Check parameter existence (no sync needed) + required_params = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] + for param in required_params: + if not hasattr(module, param) or module.get_parameter(param) is None: + raise ValueError(f"Module missing required parameter: {param}") + + # Note: Avoid checking tensor values or sizes that would require GPU sync + + def _get_weights(self, module): + """Extract and return pre-transposed weight tensors from module.""" + return { + "gate": module.get_parameter( + "gate_proj_weight" + ), # [num_experts, in_dim, out_dim] + "up": module.get_parameter("up_proj_weight"), + "down": module.get_parameter("down_proj_weight"), + } + + def _execute_projections_gpu( + self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device + ): + """Execute gate and up projections using GPU-only operations with pre-transposed weights.""" + # Find valid experts using GPU operations + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return [], [] + + # Prepare metadata in batch using GPU operations + problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs = ( + self._prepare_gate_up_metadata_gpu( + input_tokens, + weight1, + weight2, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ) + ) + + if len(problem_sizes) == 0: + return [], [] + + # Execute grouped GEMM + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return gate_outputs, up_outputs + + def _prepare_gate_up_metadata_gpu( + self, + input_tokens, + gate_weights, + up_weights, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ): + """Prepare metadata for gate and up projections with minimal CPU-GPU sync""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + gate_outputs = [] + up_outputs = [] + + # Extract valid sizes and offsets (keep on GPU as long as possible) + valid_sizes = m_sizes_gpu[valid_indices] + valid_offsets = ( + m_offsets_gpu[valid_indices] + if len(m_offsets_gpu) > len(valid_indices) + else torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 + ) + ) + + # Filter out zero-size experts on GPU before any CPU transfer + nonzero_mask = valid_sizes > 0 + if not torch.any(nonzero_mask).item(): # Only sync needed for early exit + return problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs + + # Apply mask to get final valid experts + final_valid_indices = valid_indices[nonzero_mask] + final_valid_sizes = valid_sizes[nonzero_mask] + final_valid_offsets = valid_offsets[nonzero_mask] + + # Single batch CPU transfer at the end + final_indices_cpu = final_valid_indices.cpu() + final_sizes_cpu = final_valid_sizes.cpu() + final_offsets_cpu = final_valid_offsets.cpu() + + # Convert to lists once + indices_list = final_indices_cpu.tolist() + sizes_list = final_sizes_cpu.tolist() + offsets_list = final_offsets_cpu.tolist() + + # Now iterate with pre-transferred data + for expert_idx, size, offset in zip(indices_list, sizes_list, offsets_list): + # Get expert data + expert_tokens = input_tokens[offset : offset + size].contiguous() + # Pre-transposed weights: [in_dim, out_dim] - no transpose needed + gate_weight = gate_weights[expert_idx].contiguous() + up_weight = up_weights[expert_idx].contiguous() + + M, K = expert_tokens.shape + N = gate_weight.shape[1] # output dimension is now second dimension + L = 1 + + # Create output tensors + gate_output = torch.empty(M, N, 
dtype=self.DTYPE_TORCH, device=device) + up_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + + # Add both projections to metadata + for weight, output, output_list in [ + (gate_weight, gate_output, gate_outputs), + (up_weight, up_output, up_outputs), + ]: + self._add_projection_to_metadata( + expert_tokens, + weight, + output, + problem_sizes, + strides_abc, + ptrs_abc, + ) + output_list.append(output) + + return problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs + + def _execute_down_projection_gpu( + self, hidden_states, down_weights, m_sizes_gpu, device + ): + """Execute down projection using GPU operations with pre-transposed weights.""" + if not hidden_states: + return [] + + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + # Prepare metadata + problem_sizes, strides_abc, ptrs_abc, down_outputs = ( + self._prepare_down_metadata_gpu( + hidden_states, down_weights, valid_indices, device + ) + ) + + if len(problem_sizes) == 0: + return [] + + # Execute grouped GEMM + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return down_outputs + + def _prepare_down_metadata_gpu( + self, hidden_states, down_weights, valid_indices, device + ): + """Prepare metadata for down projection with minimal CPU-GPU sync""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + down_outputs = [] + + # Filter valid indices to match hidden states length on GPU + num_hidden_states = len(hidden_states) + if num_hidden_states == 0: + return problem_sizes, strides_abc, ptrs_abc, down_outputs + + # Limit valid indices to available hidden states (GPU operation) + valid_indices_limited = valid_indices[:num_hidden_states] + + # Single batch CPU transfer + valid_indices_cpu = valid_indices_limited.cpu().tolist() + + for i, expert_idx in enumerate(valid_indices_cpu): + if i < num_hidden_states: + hidden = hidden_states[i] + # Pre-transposed down weights: original [hidden_size, intermediate_size] -> [intermediate_size, hidden_size] + down_weight = down_weights[expert_idx].contiguous() + + # Debug logging + self._debug_log_shapes( + f"Down projection expert {expert_idx}", + hidden=hidden, + down_weight=down_weight, + ) + + M, K = hidden.shape # M = batch_size, K = intermediate_size + K_weight, N = ( + down_weight.shape + ) # K_weight = intermediate_size, N = hidden_size + + # Verify dimension compatibility + if K != K_weight: + raise ValueError( + f"Dimension mismatch in down projection: " + f"hidden states have {K} features but down_weight expects {K_weight} input features. 
" + f"Hidden shape: {hidden.shape}, Down weight shape: {down_weight.shape}" + ) + + # Create output tensor + down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + down_outputs.append(down_output) + + # Add to metadata + self._add_projection_to_metadata( + hidden, + down_weight, + down_output, + problem_sizes, + strides_abc, + ptrs_abc, + ) + + return problem_sizes, strides_abc, ptrs_abc, down_outputs + + def _add_projection_to_metadata( + self, + input_tensor, + weight_tensor, + output_tensor, + problem_sizes, + strides_abc, + ptrs_abc, + ): + """Add a single projection to the metadata lists (assumes pre-transposed weights).""" + M, K = input_tensor.shape # M = batch_size, K = input_features + K_weight, N = ( + weight_tensor.shape + ) # K_weight = input_features, N = output_features + L = 1 + + # Verify dimension compatibility for matrix multiplication + if K != K_weight: + raise ValueError( + f"Matrix multiplication dimension mismatch: " + f"input has {K} features but weight expects {K_weight} input features. " + f"Input shape: {input_tensor.shape}, Weight shape: {weight_tensor.shape}" + ) + + # Convert to MNKL format + input_mnkl = input_tensor.unsqueeze(-1).contiguous() + weight_mnkl = weight_tensor.unsqueeze(-1).contiguous() + output_mnkl = output_tensor.unsqueeze(-1).contiguous() + + # Extract strides + input_strides = list(input_mnkl.stride()[:2]) + weight_strides = list(weight_mnkl.stride()[:2]) + output_strides = list(output_mnkl.stride()[:2]) + + # Add to metadata + problem_sizes.append([M, N, K, L]) + strides_abc.append([input_strides, weight_strides, output_strides]) + ptrs_abc.append( + [ + input_tensor.data_ptr(), + weight_tensor.data_ptr(), + output_tensor.data_ptr(), + ] + ) + + def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): + """Execute the grouped GEMM kernel.""" + num_groups = len(problem_sizes) + + # Convert to CUTE tensors + problem_sizes_cute, strides_cute, ptrs_cute = self._convert_to_cute_tensors( + problem_sizes, strides_abc, ptrs_abc, device + ) + + # Get tensormap and compute clusters + tensormap_cute = self._get_tensormap_buffer(device) + total_clusters = self._compute_total_clusters(problem_sizes) + + # Get initial tensors for compilation + initial_tensors = self._create_initial_tensors(problem_sizes[0], device) + + # Compile or retrieve kernel + compiled_kernel = self._get_compiled_kernel( + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ) + + # Execute kernel + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + torch.cuda.synchronize() + + def _convert_to_cute_tensors(self, problem_sizes, strides_abc, ptrs_abc, device): + """Convert metadata to CUTE tensors.""" + problem_sizes_tensor = torch.tensor( + problem_sizes, dtype=torch.int32, device=device + ) + strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) + ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) + + return ( + from_dlpack(problem_sizes_tensor, assumed_align=self.ALIGNMENT), + from_dlpack(strides_tensor, assumed_align=self.ALIGNMENT), + from_dlpack(ptrs_tensor, assumed_align=self.ALIGNMENT), + ) + + def _get_compiled_kernel( + self, + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ): + """Get or compile the grouped GEMM kernel.""" + cache_key = ( + num_groups, + total_clusters, + 
self.use_2cta_instrs, + self.mma_tiler_mn, + self.cluster_shape_mn, + ) + + if cache_key not in self._compiled_kernels: + logger.info( + f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}, Pre-transposed weights" + ) + + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + *initial_tensors, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + self.stream, + ) + logger.info("Kernel compilation successful") + + return self._compiled_kernels[cache_key] + + def _create_initial_tensors(self, problem_shape, device): + """Create initial CUTE tensors for kernel compilation.""" + M, N, K, L = problem_shape + + # Create tensors with pre-transposed weight layout + tensors = [ + torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A (input) + torch.randn( + K, N, dtype=self.DTYPE_TORCH, device=device + ), # B (pre-transposed weight) + torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C (output) + ] + + # Convert to MNKL format and create CUTE tensors + cute_tensors = [] + for tensor in tensors: + mnkl_tensor = tensor.unsqueeze(-1).contiguous() + cute_tensor = from_dlpack(mnkl_tensor, assumed_align=self.ALIGNMENT) + cute_tensor.element_type = self.DTYPE_CUTLASS + cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) + cute_tensors.append(cute_tensor) + + return cute_tensors + + def _get_tensormap_buffer(self, device): + """Get or create tensormap buffer.""" + if device not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + tensormap_tensor = torch.zeros( + (sm_count, self.TENSORMAP_COUNT, self.TENSORMAP_BYTES // 8), + dtype=torch.int64, + device=device, + ) + self._tensormap_buffers[device] = from_dlpack( + tensormap_tensor, assumed_align=self.ALIGNMENT + ) + + return self._tensormap_buffers[device] + + def _compute_total_clusters(self, problem_sizes): + """Compute total number of clusters needed.""" + cluster_tile_m = self.mma_tiler_mn[0] + cluster_tile_n = self.mma_tiler_mn[1] + + # Adjust for 2 CTA mode and cluster shape + if self.use_2cta_instrs: + cluster_tile_m //= 2 + + cluster_tile_m *= self.cluster_shape_mn[0] + cluster_tile_n *= self.cluster_shape_mn[1] + + total = 0 + for M, N, K, L in problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + + return total + + def _apply_activation_and_combine(self, gate_outputs, up_outputs): + """Apply activation and combine gate/up outputs.""" + return [ + self.activation_function(gate_out) * up_out + for gate_out, up_out in zip(gate_outputs, up_outputs) + ] + + def _reconstruct_output_gpu( + self, final_outputs, m_sizes_gpu, m_offsets_gpu, output + ): + """Reconstruct the full output tensor with minimal CPU-GPU sync""" + if not final_outputs: + return output + + # Find valid experts on GPU + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + valid_sizes = m_sizes_gpu[valid_indices] + + # Filter to match final_outputs length + num_outputs = len(final_outputs) + if num_outputs == 0: + return output + + # Limit to available outputs + valid_indices_limited = valid_indices[:num_outputs] + valid_sizes_limited = valid_sizes[:num_outputs] + + # Compute offsets if not provided properly (GPU operations) + if len(m_offsets_gpu) <= len(valid_indices_limited): + valid_offsets_limited = 
torch.cumsum( + torch.cat( + [ + torch.tensor([0], device=m_sizes_gpu.device), + valid_sizes_limited[:-1], + ] + ), + dim=0, + ) + else: + valid_offsets_limited = m_offsets_gpu[valid_indices_limited] + + # Single batch CPU transfer for reconstruction + valid_sizes_cpu = valid_sizes_limited.cpu().tolist() + valid_offsets_cpu = valid_offsets_limited.cpu().tolist() + + # Reconstruct output using pre-transferred data + for i, (size, offset) in enumerate(zip(valid_sizes_cpu, valid_offsets_cpu)): + if i < len(final_outputs): + output[offset : offset + size] = final_outputs[i] + + return output + + @staticmethod + def is_available() -> bool: + """Check if CUTLASS is available on the current system.""" + return HAS_CUTLASS diff --git a/torchtitan/experiments/deepseek_v3/generate.py b/torchtitan/experiments/deepseek_v3/generate.py index 4c26052b3..b2fe217a8 100644 --- a/torchtitan/experiments/deepseek_v3/generate.py +++ b/torchtitan/experiments/deepseek_v3/generate.py @@ -8,6 +8,7 @@ # use inference.sh "Your Question Here?" to run inference with a single prompt. +import os import sys from dataclasses import dataclass @@ -353,6 +354,8 @@ def generate_with_cuda_graph( if __name__ == "__main__": + rank = int(os.environ.get("RANK", "0")) + torch.cuda.set_device(rank) # Get user prompt from command line arguments user_prompt = "What is 2+2?" # Default prompt if len(sys.argv) > 1: diff --git a/torchtitan/experiments/deepseek_v3/group_gemms.py b/torchtitan/experiments/deepseek_v3/group_gemms.py index 753cb86b2..41344ee5b 100644 --- a/torchtitan/experiments/deepseek_v3/group_gemms.py +++ b/torchtitan/experiments/deepseek_v3/group_gemms.py @@ -70,6 +70,7 @@ import logging + logger = logging.getLogger(__name__) @@ -122,8 +123,9 @@ def is_available() -> bool: "TorchBF16GroupGEMM", "TorchAOBF16GroupGEMM", "TritonCGBF16GroupGEMM", - "CUTLASSGroupedGemmStrategy", + # "CUTLASSGroupedGemmStrategy", "ManualLoopGroupGEMM", + # "ImprovedCUTLASSGroupedGemmStrategy", ] @@ -187,7 +189,671 @@ def is_available() -> bool: return True -class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): +class CUTLASSGroupedGemmStrategy_down_not_working(GroupGEMMStrategy): + """ + Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. + + This version pre-transposes weights during arrangement and eliminates CPU-GPU synchronization + by keeping all size/offset computations on GPU. 
+ """ + + # Constants (same as before) + SUPPORTED_CLUSTER_SHAPES = [ + (1, 1), + (1, 2), + (1, 4), + (2, 1), + (2, 2), + (2, 4), + (4, 1), + (4, 2), + (4, 4), + ] + + SINGLE_CTA_M_SIZES = [128, 64] + DUAL_CTA_M_SIZES = [256, 128] + N_SIZE_RANGE = range(32, 257, 32) # 32 - 256, step 32 + + DTYPE_TORCH = torch.bfloat16 + DTYPE_CUTLASS = cutlass.BFloat16 + ACC_DTYPE = cutlass.Float32 + ALIGNMENT = 16 + TENSORMAP_COUNT = 3 + TENSORMAP_BYTES = 128 + + def __init__( + self, + custom_activation, + use_2cta_instrs=True, + mma_tiler_mn=(256, 128), + cluster_shape_mn=(4, 4), + validate=True, + ): + """Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture.""" + super().__init__(custom_activation) + self.use_2cta_instrs = use_2cta_instrs + self.validate = validate + + # Set configuration + self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() + self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() + + # Validate configurations only if validation is enabled + if self.validate: + self._validate_configurations() + + # Initialize kernel and hardware info + self._initialize_kernel() + self._initialize_hardware() + + # Initialize caches + self._compiled_kernels = {} + self._tensormap_buffers = {} + + self._log_initialization() + + def _get_default_mma_tiler(self): + """Get default MMA tiler configuration based on CTA mode.""" + return (256, 128) if self.use_2cta_instrs else (128, 128) + + def _get_default_cluster_shape(self): + """Get default cluster shape based on CTA mode.""" + return (2, 2) if self.use_2cta_instrs else (1, 1) + + def _initialize_kernel(self): + """Initialize the CUTLASS grouped GEMM kernel.""" + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.ACC_DTYPE, + use_2cta_instrs=self.use_2cta_instrs, + mma_tiler_mn=self.mma_tiler_mn, + cluster_shape_mn=self.cluster_shape_mn, + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + def _initialize_hardware(self): + """Initialize hardware information and stream.""" + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters( + self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + ) + + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) + + def _validate_configurations(self): + """Validate configurations for Blackwell.""" + self._validate_mma_tiler() + self._validate_cluster_shape() + self._validate_2cta_constraints() + + def _validate_mma_tiler(self): + """Validate MMA tiler configuration.""" + m_size, n_size = self.mma_tiler_mn + + valid_m_sizes = ( + self.DUAL_CTA_M_SIZES if self.use_2cta_instrs else self.SINGLE_CTA_M_SIZES + ) + mode_name = "2 CTA" if self.use_2cta_instrs else "single CTA" + + if m_size not in valid_m_sizes: + raise ValueError( + f"For {mode_name} mode on Blackwell, MMA tiler M must be in {valid_m_sizes}, got {m_size}" + ) + + if n_size not in self.N_SIZE_RANGE: + raise ValueError( + f"MMA tiler N must be in range [32, 256] with step 32, got {n_size}" + ) + + def _validate_cluster_shape(self): + """Validate cluster shape configuration.""" + if self.cluster_shape_mn not in self.SUPPORTED_CLUSTER_SHAPES: + raise ValueError( + f"Cluster shape {self.cluster_shape_mn} not supported on Blackwell. 
" + f"Valid cluster shapes are: {self.SUPPORTED_CLUSTER_SHAPES}" + ) + + def _validate_2cta_constraints(self): + """Validate 2 CTA specific constraints.""" + if self.use_2cta_instrs and self.cluster_shape_mn[0] % 2 != 0: + valid_2cta_shapes = [ + shape for shape in self.SUPPORTED_CLUSTER_SHAPES if shape[0] % 2 == 0 + ] + raise ValueError( + f"For 2 CTA mode, cluster shape M must be even, got {self.cluster_shape_mn[0]}. " + f"Valid 2 CTA cluster shapes: {valid_2cta_shapes}" + ) + + def _log_initialization(self): + """Log initialization information.""" + cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + print(f"Initialized CUTLASSGroupedGemmStrategy for Blackwell with:") + print(f" - 2 CTA instructions: {self.use_2cta_instrs}") + print(f" - MMA tiler (M, N): {self.mma_tiler_mn}") + print(f" - Cluster shape (M, N): {self.cluster_shape_mn}") + print(f" - Cluster size: {cluster_size}") + print(f" - Pre-transposed weights: Enabled") + print(f" - Input validation: {'Enabled' if self.validate else 'Disabled'}") + if cluster_size > 1: + print(f" - Using multi-CTA parallelism") + + def arrange_expert_weights(self, all_weights, submod_name, module): + """Store weights in pre-transposed stacked format for optimal CUTLASS performance.""" + # Pre-transpose weights from [out_dim, in_dim] to [in_dim, out_dim] + # This eliminates the need for transpose operations during execution + transposed_weights = [w.t().contiguous() for w in all_weights] + print(f"Pre-transposing weights for {submod_name} module") + return torch.stack(transposed_weights) + + def execute(self, contig_tokens, m_sizes, m_offsets, module): + """ + Execute using CUTLASS grouped GEMM kernel with pre-transposed weights. + + Args: + contig_tokens: Input tokens arranged contiguously by expert + m_sizes: Tensor of expert sizes (GPU tensor to avoid sync) + m_offsets: Tensor of expert offsets (GPU tensor to avoid sync) + module: MoE module containing pre-transposed weights + """ + print(f"gpu tensor conversion next") + # Convert to GPU tensors if needed (avoid CPU-GPU sync) + m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( + m_sizes, m_offsets, contig_tokens.device + ) + + print(f"validate inputs") + # Validate inputs only if validation is enabled + if self.validate: + self._validate_inputs(contig_tokens, m_sizes_gpu, module) + + # Get pre-transposed weights and device + print(f"get weights") + weights = self._get_weights(module) + device = contig_tokens.device + + # Prepare output tensor + print(f"prepare output tensor") + output = torch.zeros( + contig_tokens.shape[0], + weights["gate"].shape[2], # output dimension is now last dimension + dtype=self.DTYPE_TORCH, + device=device, + ) + + # Check for valid experts using GPU operations (no sync) + if not self._has_valid_experts_gpu(m_sizes_gpu): + return output + + # Execute the three-stage computation using GPU-only operations with pre-transposed weights + print(f"executing gate and up projections for {module} module") + gate_outputs, up_outputs = self._execute_projections_gpu( + contig_tokens, + weights["gate"], + weights["up"], + m_sizes_gpu, + m_offsets_gpu, + device, + ) + + print(f"hiddent states next") + hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) + + print(f"down projection next") + + final_outputs = self._execute_down_projection_gpu( + hidden_states, weights["down"], m_sizes_gpu, device + ) + print(f"final outputs next") + return self._reconstruct_output_gpu( + final_outputs, m_sizes_gpu, m_offsets_gpu, output + ) + + def 
_ensure_gpu_tensors(self, m_sizes, m_offsets, device): + """Ensure sizes and offsets are GPU tensors to avoid CPU-GPU sync.""" + if not isinstance(m_sizes, torch.Tensor): + m_sizes_gpu = torch.tensor(m_sizes, dtype=torch.int32, device=device) + else: + m_sizes_gpu = m_sizes.to(device=device, dtype=torch.int32) + + if not isinstance(m_offsets, torch.Tensor): + m_offsets_gpu = torch.tensor(m_offsets, dtype=torch.int32, device=device) + else: + m_offsets_gpu = m_offsets.to(device=device, dtype=torch.int32) + + return m_sizes_gpu, m_offsets_gpu + + def _has_valid_experts_gpu(self, m_sizes_gpu): + """Check if any experts have tokens using GPU operations (no sync).""" + return torch.any( + m_sizes_gpu > 0 + ).item() # Single sync here is unavoidable for control flow + + def _validate_inputs(self, contig_tokens, m_sizes_gpu, module): + """Validate input parameters.""" + if contig_tokens.dtype != self.DTYPE_TORCH: + raise ValueError( + f"Expected input dtype {self.DTYPE_TORCH}, got {contig_tokens.dtype}" + ) + + if len(contig_tokens.shape) != 2: + raise ValueError( + f"Expected 2D input tensor, got shape {contig_tokens.shape}" + ) + + required_params = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] + for param in required_params: + if not hasattr(module, param) or module.get_parameter(param) is None: + raise ValueError(f"Module missing required parameter: {param}") + + def _get_weights(self, module): + """Extract and return pre-transposed weight tensors from module.""" + return { + "gate": module.get_parameter( + "gate_proj_weight" + ), # [num_experts, in_dim, out_dim] + "up": module.get_parameter("up_proj_weight"), + "down": module.get_parameter("down_proj_weight"), + } + + def _execute_projections_gpu( + self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device + ): + """Execute gate and up projections using GPU-only operations with pre-transposed weights.""" + # Find valid experts using GPU operations + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return [], [] + + # Prepare metadata in batch using GPU operations + problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs = ( + self._prepare_gate_up_metadata_gpu( + input_tokens, + weight1, + weight2, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ) + ) + + if len(problem_sizes) == 0: + return [], [] + + # Execute grouped GEMM + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return gate_outputs, up_outputs + + def _prepare_gate_up_metadata_gpu( + self, + input_tokens, + gate_weights, + up_weights, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ): + """Prepare metadata for gate and up projections with pre-transposed weights""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + gate_outputs = [] + up_outputs = [] + + # Extract valid sizes and offsets (minimal sync - only for valid experts) + valid_sizes = m_sizes_gpu[valid_indices] + valid_offsets = ( + m_offsets_gpu[valid_indices] + if len(m_offsets_gpu) > len(valid_indices) + else torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 + ) + ) + + # Convert to Python for iteration (unavoidable in this test for metadata preparation) + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + valid_indices_cpu = valid_indices.cpu().tolist() + + for i, (expert_idx, size, offset) in enumerate( + zip(valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu) + 
): + if size > 0: + # Get expert data + expert_tokens = input_tokens[offset : offset + size].contiguous() + # Pre-transposed weights: [in_dim, out_dim] - no transpose needed + gate_weight = gate_weights[expert_idx].contiguous() + up_weight = up_weights[expert_idx].contiguous() + + M, K = expert_tokens.shape + N = gate_weight.shape[1] # output dimension is now second dimension + L = 1 + + # Create output tensors + gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + up_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + + # Add both projections to metadata + for weight, output, output_list in [ + (gate_weight, gate_output, gate_outputs), + (up_weight, up_output, up_outputs), + ]: + self._add_projection_to_metadata( + expert_tokens, + weight, + output, + problem_sizes, + strides_abc, + ptrs_abc, + ) + output_list.append(output) + + return problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs + + def _execute_down_projection_gpu( + self, hidden_states, down_weights, m_sizes_gpu, device + ): + """Execute down projection using GPU operations with pre-transposed weights.""" + if not hidden_states: + return [] + print(f"{hidden_states=}, {down_weights=}") + assert hidden_states.shape[1] == down_weights.shape[1] + assert False, "check weights hidden" + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + # Prepare metadata + problem_sizes, strides_abc, ptrs_abc, down_outputs = ( + self._prepare_down_metadata_gpu( + hidden_states, down_weights, valid_indices, device + ) + ) + + if len(problem_sizes) == 0: + return [] + + # Execute grouped GEMM + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return down_outputs + + def _prepare_down_metadata_gpu( + self, hidden_states, down_weights, valid_indices, device + ): + """Prepare metadata for down projection using GPU operations with pre-transposed weights.""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + down_outputs = [] + + # Convert indices to CPU for iteration (minimal sync) + valid_indices_cpu = valid_indices.cpu().tolist() + + for i, expert_idx in enumerate(valid_indices_cpu): + if i < len(hidden_states): + hidden = hidden_states[i] + # Pre-transposed weights: [in_dim, out_dim] - no transpose needed + down_weight = down_weights[expert_idx].contiguous() + + M, K = hidden.shape + N = down_weight.shape[1] # output dimension is now second dimension + + # Create output tensor + down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + down_outputs.append(down_output) + + # Add to metadata + self._add_projection_to_metadata( + hidden, + down_weight, + down_output, + problem_sizes, + strides_abc, + ptrs_abc, + ) + + return problem_sizes, strides_abc, ptrs_abc, down_outputs + + def _add_projection_to_metadata( + self, + input_tensor, + weight_tensor, + output_tensor, + problem_sizes, + strides_abc, + ptrs_abc, + ): + """Add a single projection to the metadata lists (assumes pre-transposed weights).""" + M, K = input_tensor.shape + N = weight_tensor.shape[1] # output dimension from pre-transposed weight + L = 1 + + # Convert to MNKL format + input_mnkl = input_tensor.unsqueeze(-1).contiguous() + weight_mnkl = weight_tensor.unsqueeze(-1).contiguous() + output_mnkl = output_tensor.unsqueeze(-1).contiguous() + + # Extract strides + input_strides = list(input_mnkl.stride()[:2]) + weight_strides = list(weight_mnkl.stride()[:2]) + output_strides = list(output_mnkl.stride()[:2]) + + # Add to metadata 
+ problem_sizes.append([M, N, K, L]) + strides_abc.append([input_strides, weight_strides, output_strides]) + ptrs_abc.append( + [ + input_tensor.data_ptr(), + weight_tensor.data_ptr(), + output_tensor.data_ptr(), + ] + ) + + def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): + """Execute the grouped GEMM kernel.""" + num_groups = len(problem_sizes) + + # Convert to CUTE tensors + problem_sizes_cute, strides_cute, ptrs_cute = self._convert_to_cute_tensors( + problem_sizes, strides_abc, ptrs_abc, device + ) + + # Get tensormap and compute clusters + tensormap_cute = self._get_tensormap_buffer(device) + total_clusters = self._compute_total_clusters(problem_sizes) + + # Get initial tensors for compilation + initial_tensors = self._create_initial_tensors(problem_sizes[0], device) + + # Compile or retrieve kernel + compiled_kernel = self._get_compiled_kernel( + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ) + + # Execute kernel + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + torch.cuda.synchronize() + + def _convert_to_cute_tensors(self, problem_sizes, strides_abc, ptrs_abc, device): + """Convert metadata to CUTE tensors.""" + problem_sizes_tensor = torch.tensor( + problem_sizes, dtype=torch.int32, device=device + ) + strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) + ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) + + return ( + from_dlpack(problem_sizes_tensor, assumed_align=self.ALIGNMENT), + from_dlpack(strides_tensor, assumed_align=self.ALIGNMENT), + from_dlpack(ptrs_tensor, assumed_align=self.ALIGNMENT), + ) + + def _get_compiled_kernel( + self, + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ): + """Get or compile the grouped GEMM kernel.""" + cache_key = ( + num_groups, + total_clusters, + self.use_2cta_instrs, + self.mma_tiler_mn, + self.cluster_shape_mn, + ) + + if cache_key not in self._compiled_kernels: + print( + f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}, Pre-transposed weights" + ) + + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + *initial_tensors, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + self.stream, + ) + print("Kernel compilation successful") + + return self._compiled_kernels[cache_key] + + def _create_initial_tensors(self, problem_shape, device): + """Create initial CUTE tensors for kernel compilation.""" + M, N, K, L = problem_shape + + # Create tensors with pre-transposed weight layout + tensors = [ + torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A (input) + torch.randn( + K, N, dtype=self.DTYPE_TORCH, device=device + ), # B (pre-transposed weight) + torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C (output) + ] + + # Convert to MNKL format and create CUTE tensors + cute_tensors = [] + for tensor in tensors: + mnkl_tensor = tensor.unsqueeze(-1).contiguous() + cute_tensor = from_dlpack(mnkl_tensor, assumed_align=self.ALIGNMENT) + cute_tensor.element_type = self.DTYPE_CUTLASS + cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) + cute_tensors.append(cute_tensor) + + return cute_tensors + + def _get_tensormap_buffer(self, device): + """Get or 
create tensormap buffer.""" + if device not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + tensormap_tensor = torch.zeros( + (sm_count, self.TENSORMAP_COUNT, self.TENSORMAP_BYTES // 8), + dtype=torch.int64, + device=device, + ) + self._tensormap_buffers[device] = from_dlpack( + tensormap_tensor, assumed_align=self.ALIGNMENT + ) + + return self._tensormap_buffers[device] + + def _compute_total_clusters(self, problem_sizes): + """Compute total number of clusters needed.""" + cluster_tile_m = self.mma_tiler_mn[0] + cluster_tile_n = self.mma_tiler_mn[1] + + # Adjust for 2 CTA mode and cluster shape + if self.use_2cta_instrs: + cluster_tile_m //= 2 + + cluster_tile_m *= self.cluster_shape_mn[0] + cluster_tile_n *= self.cluster_shape_mn[1] + + total = 0 + for M, N, K, L in problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + + return total + + def _apply_activation_and_combine(self, gate_outputs, up_outputs): + """Apply activation and combine gate/up outputs.""" + return [ + self.activation_function(gate_out) * up_out + for gate_out, up_out in zip(gate_outputs, up_outputs) + ] + + def _reconstruct_output_gpu( + self, final_outputs, m_sizes_gpu, m_offsets_gpu, output + ): + """Reconstruct the full output tensor using GPU operations (minimal sync).""" + if not final_outputs: + return output + + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + valid_sizes = m_sizes_gpu[valid_indices] + + # Compute offsets if not provided properly + if len(m_offsets_gpu) <= len(valid_indices): + valid_offsets = torch.cumsum( + torch.cat( + [torch.tensor([0], device=m_sizes_gpu.device), valid_sizes[:-1]] + ), + dim=0, + ) + else: + valid_offsets = m_offsets_gpu[valid_indices] + + # Convert to CPU for final reconstruction (minimal sync) + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + + for i, (size, offset) in enumerate(zip(valid_sizes_cpu, valid_offsets_cpu)): + if i < len(final_outputs): + output[offset : offset + size] = final_outputs[i] + + return output + + @staticmethod + def is_available() -> bool: + return HAS_CUTLASS + + +# ========================= end of CUTLASSGroupedGemmStrategy ========================= + + +class CUTLASSGroupedGemmStrategy_prev(GroupGEMMStrategy): """ Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. 
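The strategy variants kept in group_gemms.py above all reduce each expert's GEMM to the same per-group bookkeeping: an (M, N, K, L) problem size, the leading strides of an MNKL view of each operand, and raw device pointers, which are later packed into int32/int64 tensors for the kernel. The sketch below illustrates that bookkeeping in plain PyTorch with no CUTLASS dependency; the helper name build_group_metadata and the toy shapes are assumptions made for this illustration and do not appear in the patch.

.. code-block:: python

    import torch

    def build_group_metadata(inputs, weights):
        """For each (input [M, K], weight [K, N]) pair, record the (M, N, K, L=1)
        problem size, the leading strides of an MNKL view, and the raw data
        pointers, mirroring what the strategies above collect per expert."""
        problem_sizes, strides_abc, ptrs_abc, outputs = [], [], [], []
        for a, b in zip(inputs, weights):
            M, K = a.shape
            K_w, N = b.shape
            assert K == K_w, f"K mismatch: input has {K}, weight expects {K_w}"
            c = torch.empty(M, N, dtype=a.dtype, device=a.device)
            # Append a trailing L=1 mode so A, B, C are viewed as (M,K,1), (K,N,1), (M,N,1).
            a_mnkl, b_mnkl, c_mnkl = (t.unsqueeze(-1).contiguous() for t in (a, b, c))
            problem_sizes.append([M, N, K, 1])
            strides_abc.append([list(t.stride()[:2]) for t in (a_mnkl, b_mnkl, c_mnkl)])
            ptrs_abc.append([t.data_ptr() for t in (a, b, c)])
            outputs.append(c)
        return problem_sizes, strides_abc, ptrs_abc, outputs

    # Toy usage: two "experts" with different token counts but a shared weight shape.
    if torch.cuda.is_available():
        dev = "cuda"
        inputs = [torch.randn(m, 64, device=dev, dtype=torch.bfloat16) for m in (96, 160)]
        weights = [torch.randn(64, 128, device=dev, dtype=torch.bfloat16) for _ in range(2)]
        sizes, strides, ptrs, outs = build_group_metadata(inputs, weights)
        print(sizes)  # [[96, 128, 64, 1], [160, 128, 64, 1]]

In the patch, these host-side lists are then converted to int32/int64 device tensors in _convert_to_cute_tensors and wrapped with from_dlpack before the compiled grouped GEMM kernel is launched.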
diff --git a/torchtitan/experiments/deepseek_v3/model.py b/torchtitan/experiments/deepseek_v3/model.py index 82c9e4dc7..90b79e39d 100644 --- a/torchtitan/experiments/deepseek_v3/model.py +++ b/torchtitan/experiments/deepseek_v3/model.py @@ -45,7 +45,7 @@ from attn_mask_utils import _prepare_4d_causal_attention_mask from group_gemms import ( - CUTLASSGroupedGemmStrategy, + # CUTLASSGroupedGemmStrategy, DSGroupGEMM, ManualLoopGroupGEMM, TorchAOBF16GroupGEMM, @@ -59,6 +59,14 @@ from torch import nn from torch.distributed._functional_collectives import all_to_all_single_autograd +from torchtitan.experiments.deepseek_v3.cute_interface_gg import ( + ImprovedCUTLASSGroupedGemmStrategy, +) + +from torchtitan.experiments.deepseek_v3.cutlass_grouped_gemm import ( + CUTLASSGroupedGemmStrategy, +) + from torchtitan.experiments.kernels.moe.indices import generate_permute_indices from torchtitan.experiments.kernels.triton_mg_group_gemm.torchao_pr import ALIGN_SIZE_M @@ -476,7 +484,7 @@ class MoE(nn.Module): # Group GEMM strategies group_gemm_strategies = None # which group gemm to use? - group_mm = "cutlass" # fp8 options = ["torchfp8", "dsgemm"] bf16 = ["manual", "torch", , "torchao", "tritoncg"] + group_mm = "cutlass" # fp8 options = ["torchfp8", "dsgemm"] bf16 = ["manual", "torch", , "torchao", "tritoncg"] "cutlass", "improvedCutlass" def __init__(self, config): super().__init__() @@ -558,6 +566,11 @@ def _initialize_group_gemm_strategies(cls): if CUTLASSGroupedGemmStrategy.is_available() else None ), + "improvedCutlass": ( + ImprovedCUTLASSGroupedGemmStrategy(MLP.act_fn) + if ImprovedCUTLASSGroupedGemmStrategy.is_available() + else None + ), } def combine_experts(self, submod_name: str): From a34550889b0b9e096db02702ed1c3d9f6eec410f Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sat, 21 Jun 2025 21:42:58 -0700 Subject: [PATCH 25/34] standalone running, but values incorrect --- .../deepseek_v3/cutlass_grouped_gemm.py | 199 +++++++++++------- 1 file changed, 122 insertions(+), 77 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py index cdfc2673a..470b730cc 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py @@ -9,12 +9,35 @@ This module contains the CUTLASSGroupedGemmStrategy implementation that uses CUTLASS GroupedGemmKernel for high-performance group GEMM operations with -pre-transposed weights and optional input validation. +minimal CPU-GPU synchronization and optional input validation. """ """ -current error: -The expanded size of the tensor (1408) must match the existing size (2048) at non-singleton dimension 1. Target sizes: [12288, 1408]. 
Tensor sizes: [12288, 2048] +[DEBUG] Gate/Up projection expert 14 + - expert_tokens: torch.Size([12288, 2048]) + - gate_weight (after .t()): torch.Size([2048, 1408]) + - up_weight (after .t()): torch.Size([2048, 1408]) +[DEBUG] Matrix mult: input torch.Size([12288, 2048]) @ weight torch.Size([2048, 1408]) -> output torch.Size([12288, 1408]) +[DEBUG] Gate/Up projection expert 15[DEBUG] Matrix mult: input torch.Size([12288, 2048]) @ weight torch.Size([2048, 1408]) -> output torch.Size([12288, 1408]) + + - expert_tokens: torch.Size([12288, 2048]) + - gate_weight (after .t()): torch.Size([2048, 1408]) + - up_weight (after .t()): torch.Size([2048, 1408]) +[DEBUG] Matrix mult: input torch.Size([12288, 2048]) @ weight torch.Size([2048, 1408]) -> output torch.Size([12288, 1408]) +[DEBUG] Matrix mult: input torch.Size([12288, 2048]) @ weight torch.Size([2048, 1408]) -> output torch.Size([12288, 1408]) +down projection: +[DEBUG] Down projection expert 10[DEBUG] Matrix mult: input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) + +[DEBUG] Down projection expert 10 + - hidden: torch.Size([12288, 1408]) + - hidden: torch.Size([12288, 1408]) - down_weight (after .t()): torch.Size([1408, 2048]) + + - down_weight (after .t()): torch.Size([1408, 2048]) +[DEBUG] Matrix mult: input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) +[DEBUG] Matrix mult: input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) + + + """ @@ -40,7 +63,7 @@ print(f"✗ CUTLASS import failed: {e}") print("CUTLASSGroupedGemmStrategy will not be available") -# Import base class +# Import base class - adjust path as needed based on your project structure from .group_gemms import GroupGEMMStrategy logger = logging.getLogger(__name__) @@ -50,19 +73,6 @@ class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): """ Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. - This version is optimized for minimal CPU-GPU synchronization with the following features: - - Pre-transposed weights during arrangement (eliminates runtime transpose overhead) - - GPU-first processing with batched CPU transfers when unavoidable - - Deferred synchronization until absolutely necessary for control flow - - Filtered operations on GPU before any CPU transfer - - Optional input validation that can be disabled for production performance - - Sync Optimization Strategy: - - Keep all computations on GPU as long as possible - - Batch multiple CPU transfers into single operations - - Filter invalid/empty data on GPU before CPU transfer - - Use GPU tensor operations instead of CPU iteration where possible - - Only sync when control flow decisions are absolutely required """ # Constants for Blackwell architecture support @@ -94,9 +104,9 @@ def __init__( custom_activation, use_2cta_instrs=True, mma_tiler_mn=(256, 128), - cluster_shape_mn=(4, 4), - validate=True, - debug_shapes=True, + cluster_shape_mn=(2, 2), + validate=False, + debug_shapes=False, ): """ Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture. 
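The debug dump now carried in the module docstring above fixes the per-expert shapes this kernel must reproduce: 12288 tokens, hidden_size 2048, intermediate_size 1408, with gate/up weights stored as [intermediate, hidden], the down weight as [hidden, intermediate], and each weight transposed at the matmul. A plain-PyTorch reference for a single expert under those shapes gives a convenient ground truth while this commit ("standalone running, but values incorrect") is being debugged; SiLU is assumed as the activation here, whereas the real strategy receives custom_activation.

.. code-block:: python

    import torch
    import torch.nn.functional as F

    def reference_expert_ffn(tokens, gate_w, up_w, down_w):
        """Single-expert reference with weights in the standard PyTorch
        [out_features, in_features] layout, transposed at the matmul."""
        gate = tokens @ gate_w.t()    # [M, hidden] @ [hidden, inter] -> [M, inter]
        up = tokens @ up_w.t()        # [M, hidden] @ [hidden, inter] -> [M, inter]
        hidden = F.silu(gate) * up    # assumed activation; the strategy uses custom_activation
        return hidden @ down_w.t()    # [M, inter] @ [inter, hidden] -> [M, hidden]

    if torch.cuda.is_available():
        M, H, INTER = 12288, 2048, 1408   # shapes taken from the debug log above
        dev, dt = "cuda", torch.bfloat16
        tokens = torch.randn(M, H, device=dev, dtype=dt)
        gate_w = torch.randn(INTER, H, device=dev, dtype=dt)
        up_w = torch.randn(INTER, H, device=dev, dtype=dt)
        down_w = torch.randn(H, INTER, device=dev, dtype=dt)
        out = reference_expert_ffn(tokens, gate_w, up_w, down_w)
        assert out.shape == (M, H)        # [12288, 2048], matching the expected output size

Comparing this reference against the grouped-GEMM path expert by expert (for example with torch.allclose at a loose bf16 tolerance) helps narrow the incorrect values to either the metadata layout or the kernel launch itself.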
@@ -112,7 +122,7 @@ def __init__( super().__init__(custom_activation) self.use_2cta_instrs = use_2cta_instrs self.validate = validate - self.debug_shapes = True # debug_shapes + self.debug_shapes = debug_shapes # Set configuration self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() @@ -212,7 +222,7 @@ def _log_initialization(self): logger.info(f" - MMA tiler (M, N): {self.mma_tiler_mn}") logger.info(f" - Cluster shape (M, N): {self.cluster_shape_mn}") logger.info(f" - Cluster size: {cluster_size}") - logger.info(f" - Pre-transposed weights: Enabled") + logger.info(f" - Weight format: Standard PyTorch (runtime transpose)") logger.info( f" - Input validation: {'Enabled' if self.validate else 'Disabled'}" ) @@ -232,30 +242,35 @@ def _debug_log_shapes(self, message, **tensors): shape_info.append(f"{name}: {tensor.shape}") else: shape_info.append(f"{name}: {type(tensor)}") - logger.info(f"[SHAPE DEBUG] {message} - {', '.join(shape_info)}") + logger.debug(f"[SHAPE DEBUG] {message} - {', '.join(shape_info)}") def arrange_expert_weights(self, all_weights, submod_name, module): - """Store weights in pre-transposed stacked format for optimal CUTLASS performance.""" - # Pre-transpose weights from [out_dim, in_dim] to [in_dim, out_dim] - # This eliminates the need for transpose operations during execution - - # Note: Different projections have different original shapes: - # - gate/up: [intermediate_size, hidden_size] -> transpose to [hidden_size, intermediate_size] - # - down: [hidden_size, intermediate_size] -> transpose to [intermediate_size, hidden_size] - # for w in all_weights: - # print(f"weight shape: {w.shape}, ") - # transposed_weights = [w.t().contiguous() for w in all_weights] - return torch.stack(all_weights) + """Store weights in stacked format (NO transpose - keep original PyTorch format)""" + # Keep original weight format for compatibility: + # gate/up: [intermediate_size, hidden_size] + # down: [hidden_size, intermediate_size] + + # DEBUG: Print shapes to verify no transpose + print(f"[arrange_expert_weights] Processing {submod_name}") + for i, w in enumerate(all_weights): + print(f"[arrange_expert_weights] {submod_name} expert {i}: {w.shape}") + + # NO TRANSPOSE - just stack the original weights + stacked = torch.stack(all_weights) + print( + f"[arrange_expert_weights] {submod_name} final stacked shape: {stacked.shape}" + ) + return stacked def execute(self, contig_tokens, m_sizes, m_offsets, module): """ - Execute using CUTLASS grouped GEMM kernel with pre-transposed weights. + Execute using CUTLASS grouped GEMM kernel with standard PyTorch weight format. 
Args: contig_tokens: Input tokens arranged contiguously by expert m_sizes: Tensor of expert sizes (GPU tensor to avoid sync) m_offsets: Tensor of expert offsets (GPU tensor to avoid sync) - module: MoE module containing pre-transposed weights + module: MoE module containing weights in standard PyTorch format """ # Convert to GPU tensors if needed (avoid CPU-GPU sync) m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( @@ -266,23 +281,24 @@ def execute(self, contig_tokens, m_sizes, m_offsets, module): if self.validate: self._validate_inputs(contig_tokens, m_sizes_gpu, module) - # Get pre-transposed weights and device + # Get weights and device weights = self._get_weights(module) device = contig_tokens.device - # Debug logging - self._debug_log_shapes( - "Input tensors", - contig_tokens=contig_tokens, - gate_weights=weights["gate"], - up_weights=weights["up"], - down_weights=weights["down"], - ) - - # Prepare output tensor + # Debug logging - force print for visibility + if self.debug_shapes: + print(f"[DEBUG] Input tensors - contig_tokens: {contig_tokens.shape}") + print(f"[DEBUG] Gate weights: {weights['gate'].shape}") + print(f"[DEBUG] Up weights: {weights['up'].shape}") + print(f"[DEBUG] Down weights: {weights['down'].shape}") + print(f"[DEBUG] m_sizes_gpu: {m_sizes_gpu}") + print(f"[DEBUG] m_offsets_gpu: {m_offsets_gpu}") + + # Prepare output tensor - use down projection weight shape for final output size + # Down weights are [num_experts, hidden_size, intermediate_size], so output is hidden_size output = torch.zeros( contig_tokens.shape[0], - weights["gate"].shape[2], # output dimension is now last dimension + weights["down"].shape[1], # hidden_size from down projection dtype=self.DTYPE_TORCH, device=device, ) @@ -294,7 +310,7 @@ def execute(self, contig_tokens, m_sizes, m_offsets, module): if not has_valid_experts.item(): return output - # Execute the three-stage computation using GPU-only operations with pre-transposed weights + # Execute the three-stage computation using GPU-only operations gate_outputs, up_outputs = self._execute_projections_gpu( contig_tokens, weights["gate"], @@ -363,22 +379,24 @@ def _validate_inputs(self, contig_tokens, m_sizes_gpu, module): if not hasattr(module, param) or module.get_parameter(param) is None: raise ValueError(f"Module missing required parameter: {param}") - # Note: Avoid checking tensor values or sizes that would require GPU sync - def _get_weights(self, module): - """Extract and return pre-transposed weight tensors from module.""" + """Extract and return weight tensors from module (original format, not transposed).""" return { "gate": module.get_parameter( "gate_proj_weight" - ), # [num_experts, in_dim, out_dim] - "up": module.get_parameter("up_proj_weight"), - "down": module.get_parameter("down_proj_weight"), + ), # [num_experts, intermediate_size, hidden_size] + "up": module.get_parameter( + "up_proj_weight" + ), # [num_experts, intermediate_size, hidden_size] + "down": module.get_parameter( + "down_proj_weight" + ), # [num_experts, hidden_size, intermediate_size] } def _execute_projections_gpu( self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device ): - """Execute gate and up projections using GPU-only operations with pre-transposed weights.""" + """Execute gate and up projections using GPU-only operations.""" # Find valid experts using GPU operations valid_mask = m_sizes_gpu > 0 valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] @@ -458,13 +476,33 @@ def _prepare_gate_up_metadata_gpu( for expert_idx, size, 
offset in zip(indices_list, sizes_list, offsets_list): # Get expert data expert_tokens = input_tokens[offset : offset + size].contiguous() - # Pre-transposed weights: [in_dim, out_dim] - no transpose needed - gate_weight = gate_weights[expert_idx].contiguous() - up_weight = up_weights[expert_idx].contiguous() - - M, K = expert_tokens.shape - N = gate_weight.shape[1] # output dimension is now second dimension - L = 1 + # Original weight format: gate/up are [intermediate_size, hidden_size] + # Need to transpose for matrix multiplication: tokens @ weight.t() + gate_weight = ( + gate_weights[expert_idx].t().contiguous() + ) # [hidden_size, intermediate_size] + up_weight = ( + up_weights[expert_idx].t().contiguous() + ) # [hidden_size, intermediate_size] + + if self.debug_shapes: + print(f"[DEBUG] Gate/Up projection expert {expert_idx}") + print(f" - expert_tokens: {expert_tokens.shape}") + print(f" - gate_weight (after .t()): {gate_weight.shape}") + print(f" - up_weight (after .t()): {up_weight.shape}") + + M, K = expert_tokens.shape # M = batch_size, K = hidden_size + K_weight, N = ( + gate_weight.shape + ) # K_weight = hidden_size, N = intermediate_size + + # Verify dimension compatibility + if K != K_weight: + raise ValueError( + f"Dimension mismatch in gate/up projections: " + f"input tokens have {K} features but weight expects {K_weight}. " + f"Tokens shape: {expert_tokens.shape}, Gate weight shape: {gate_weight.shape}" + ) # Create output tensors gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) @@ -490,7 +528,7 @@ def _prepare_gate_up_metadata_gpu( def _execute_down_projection_gpu( self, hidden_states, down_weights, m_sizes_gpu, device ): - """Execute down projection using GPU operations with pre-transposed weights.""" + """Execute down projection using GPU operations.""" if not hidden_states: return [] @@ -536,15 +574,16 @@ def _prepare_down_metadata_gpu( for i, expert_idx in enumerate(valid_indices_cpu): if i < num_hidden_states: hidden = hidden_states[i] - # Pre-transposed down weights: original [hidden_size, intermediate_size] -> [intermediate_size, hidden_size] - down_weight = down_weights[expert_idx].contiguous() - - # Debug logging - self._debug_log_shapes( - f"Down projection expert {expert_idx}", - hidden=hidden, - down_weight=down_weight, - ) + # Original down weight format: [hidden_size, intermediate_size] + # Need to transpose for matrix multiplication: hidden @ weight.t() + down_weight = ( + down_weights[expert_idx].t().contiguous() + ) # [intermediate_size, hidden_size] + + if self.debug_shapes: + print(f"[DEBUG] Down projection expert {expert_idx}") + print(f" - hidden: {hidden.shape}") + print(f" - down_weight (after .t()): {down_weight.shape}") M, K = hidden.shape # M = batch_size, K = intermediate_size K_weight, N = ( @@ -584,13 +623,19 @@ def _add_projection_to_metadata( strides_abc, ptrs_abc, ): - """Add a single projection to the metadata lists (assumes pre-transposed weights).""" + """Add a single projection to the metadata lists (weights are transposed at call site).""" M, K = input_tensor.shape # M = batch_size, K = input_features K_weight, N = ( weight_tensor.shape ) # K_weight = input_features, N = output_features L = 1 + # Debug print + if self.debug_shapes: + print( + f"[DEBUG] Matrix mult: input {input_tensor.shape} @ weight {weight_tensor.shape} -> output {output_tensor.shape}" + ) + # Verify dimension compatibility for matrix multiplication if K != K_weight: raise ValueError( @@ -693,7 +738,7 @@ def _get_compiled_kernel( if cache_key 
not in self._compiled_kernels: logger.info( - f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}, Pre-transposed weights" + f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}" ) self._compiled_kernels[cache_key] = cute.compile( @@ -716,12 +761,12 @@ def _create_initial_tensors(self, problem_shape, device): """Create initial CUTE tensors for kernel compilation.""" M, N, K, L = problem_shape - # Create tensors with pre-transposed weight layout + # Create tensors with standard PyTorch layout (weights will be transposed at runtime) tensors = [ torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A (input) torch.randn( K, N, dtype=self.DTYPE_TORCH, device=device - ), # B (pre-transposed weight) + ), # B (transposed weight) torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C (output) ] From 47d614c2abc07aa61a8fe19d41fb0e6f082b9054 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sat, 21 Jun 2025 21:57:03 -0700 Subject: [PATCH 26/34] integrate cute kernel cache options --- torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py index 470b730cc..2078620c0 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py @@ -41,6 +41,12 @@ """ +# Disable file caching while keeping in-memory cache available, defaults to False. +# export CUTE_DSL_DISABLE_FILE_CACHING=True + +# Maximum number of cache files allowed, defaults to 1000. +# export CUTE_DSL_FILE_CACHING_CAPACITY=1000 + import logging import torch From 00b90c1317f8251e8bfd3ca2cff3625e6099ecbb Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sat, 21 Jun 2025 22:00:32 -0700 Subject: [PATCH 27/34] move working version to standlone --- .../deepseek_v3/cutlass_grouped_gemm.py | 639 +++++++++++++++++- .../experiments/deepseek_v3/group_gemms.py | 637 ----------------- 2 files changed, 638 insertions(+), 638 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py index 2078620c0..bd2d540c9 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py @@ -75,7 +75,644 @@ logger = logging.getLogger(__name__) -class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): +class CUTLASSGroupedGemmStrategy_prev(GroupGEMMStrategy): + """ + Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. + + This version eliminates CPU-GPU synchronization by keeping all size/offset computations on GPU. 
+ """ + + # Constants (same as before) + SUPPORTED_CLUSTER_SHAPES = [ + (1, 1), + (1, 2), + (1, 4), + (2, 1), + (2, 2), + (2, 4), + (4, 1), + (4, 2), + (4, 4), + ] + + SINGLE_CTA_M_SIZES = [128, 64] + DUAL_CTA_M_SIZES = [256, 128] + N_SIZE_RANGE = range(32, 257, 32) # 32 - 256, step 32 + + DTYPE_TORCH = torch.bfloat16 + DTYPE_CUTLASS = cutlass.BFloat16 + ACC_DTYPE = cutlass.Float32 + ALIGNMENT = 16 + TENSORMAP_COUNT = 3 + TENSORMAP_BYTES = 128 + + def __init__( + self, + custom_activation, + use_2cta_instrs=True, + mma_tiler_mn=(256, 128), + cluster_shape_mn=(4, 4), + ): + """Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture.""" + super().__init__(custom_activation) + self.use_2cta_instrs = use_2cta_instrs + + # Set configuration + self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() + self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() + + # Validate configurations + # self._validate_configurations() + + # Initialize kernel and hardware info + self._initialize_kernel() + self._initialize_hardware() + + # Initialize caches + self._compiled_kernels = {} + self._tensormap_buffers = {} + + self._log_initialization() + + def _get_default_mma_tiler(self): + """Get default MMA tiler configuration based on CTA mode.""" + return (256, 128) if self.use_2cta_instrs else (128, 128) + + def _get_default_cluster_shape(self): + """Get default cluster shape based on CTA mode.""" + return (2, 2) if self.use_2cta_instrs else (1, 1) + + def _initialize_kernel(self): + """Initialize the CUTLASS grouped GEMM kernel.""" + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.ACC_DTYPE, + use_2cta_instrs=self.use_2cta_instrs, + mma_tiler_mn=self.mma_tiler_mn, + cluster_shape_mn=self.cluster_shape_mn, + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + def _initialize_hardware(self): + """Initialize hardware information and stream.""" + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters( + self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + ) + + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) + + def _validate_configurations(self): + """Validate configurations for Blackwell.""" + self._validate_mma_tiler() + self._validate_cluster_shape() + self._validate_2cta_constraints() + + def _validate_mma_tiler(self): + """Validate MMA tiler configuration.""" + m_size, n_size = self.mma_tiler_mn + + valid_m_sizes = ( + self.DUAL_CTA_M_SIZES if self.use_2cta_instrs else self.SINGLE_CTA_M_SIZES + ) + mode_name = "2 CTA" if self.use_2cta_instrs else "single CTA" + + if m_size not in valid_m_sizes: + raise ValueError( + f"For {mode_name} mode on Blackwell, MMA tiler M must be in {valid_m_sizes}, got {m_size}" + ) + + if n_size not in self.N_SIZE_RANGE: + raise ValueError( + f"MMA tiler N must be in range [32, 256] with step 32, got {n_size}" + ) + + def _validate_cluster_shape(self): + """Validate cluster shape configuration.""" + if self.cluster_shape_mn not in self.SUPPORTED_CLUSTER_SHAPES: + raise ValueError( + f"Cluster shape {self.cluster_shape_mn} not supported on Blackwell. 
" + f"Valid cluster shapes are: {self.SUPPORTED_CLUSTER_SHAPES}" + ) + + def _validate_2cta_constraints(self): + """Validate 2 CTA specific constraints.""" + if self.use_2cta_instrs and self.cluster_shape_mn[0] % 2 != 0: + valid_2cta_shapes = [ + shape for shape in self.SUPPORTED_CLUSTER_SHAPES if shape[0] % 2 == 0 + ] + raise ValueError( + f"For 2 CTA mode, cluster shape M must be even, got {self.cluster_shape_mn[0]}. " + f"Valid 2 CTA cluster shapes: {valid_2cta_shapes}" + ) + + def _log_initialization(self): + """Log initialization information.""" + cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + print(f"Initialized CUTLASSGroupedGemmStrategy for Blackwell with:") + print(f" - 2 CTA instructions: {self.use_2cta_instrs}") + print(f" - MMA tiler (M, N): {self.mma_tiler_mn}") + print(f" - Cluster shape (M, N): {self.cluster_shape_mn}") + print(f" - Cluster size: {cluster_size}") + if cluster_size > 1: + print(f" - Using multi-CTA parallelism") + + def arrange_expert_weights(self, all_weights, submod_name, module): + """Store weights in stacked format.""" + return torch.stack(all_weights) + + def execute(self, contig_tokens, m_sizes, m_offsets, module): + """ + Execute using CUTLASS grouped GEMM kernel - GPU-only version. + + Args: + contig_tokens: Input tokens arranged contiguously by expert + m_sizes: Tensor of expert sizes (GPU tensor to avoid sync) + m_offsets: Tensor of expert offsets (GPU tensor to avoid sync) + module: MoE module containing weights + """ + # Convert to GPU tensors if needed (avoid CPU-GPU sync) + m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( + m_sizes, m_offsets, contig_tokens.device + ) + + # Validate inputs + # self._validate_inputs(contig_tokens, m_sizes_gpu, module) + + # Get weights and device + weights = self._get_weights(module) + device = contig_tokens.device + + # Prepare output tensor + output = torch.zeros( + contig_tokens.shape[0], + weights["gate"].shape[2], + dtype=self.DTYPE_TORCH, + device=device, + ) + + # Check for valid experts using GPU operations (no sync) + if not self._has_valid_experts_gpu(m_sizes_gpu): + return output + + # Execute the three-stage computation using GPU-only operations + gate_outputs, up_outputs = self._execute_projections_gpu( + contig_tokens, + weights["gate"], + weights["up"], + m_sizes_gpu, + m_offsets_gpu, + device, + ) + + hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) + + final_outputs = self._execute_down_projection_gpu( + hidden_states, weights["down"], m_sizes_gpu, device + ) + + return self._reconstruct_output_gpu( + final_outputs, m_sizes_gpu, m_offsets_gpu, output + ) + + def _ensure_gpu_tensors(self, m_sizes, m_offsets, device): + """Ensure sizes and offsets are GPU tensors to avoid CPU-GPU sync.""" + if not isinstance(m_sizes, torch.Tensor): + m_sizes_gpu = torch.tensor(m_sizes, dtype=torch.int32, device=device) + else: + m_sizes_gpu = m_sizes.to(device=device, dtype=torch.int32) + + if not isinstance(m_offsets, torch.Tensor): + m_offsets_gpu = torch.tensor(m_offsets, dtype=torch.int32, device=device) + else: + m_offsets_gpu = m_offsets.to(device=device, dtype=torch.int32) + + return m_sizes_gpu, m_offsets_gpu + + def _has_valid_experts_gpu(self, m_sizes_gpu): + """Check if any experts have tokens using GPU operations (no sync).""" + return torch.any( + m_sizes_gpu > 0 + ).item() # Single sync here is unavoidable for control flow + + def _validate_inputs(self, contig_tokens, m_sizes_gpu, module): + """Validate input parameters.""" + if 
contig_tokens.dtype != self.DTYPE_TORCH: + raise ValueError( + f"Expected input dtype {self.DTYPE_TORCH}, got {contig_tokens.dtype}" + ) + + if len(contig_tokens.shape) != 2: + raise ValueError( + f"Expected 2D input tensor, got shape {contig_tokens.shape}" + ) + + required_params = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] + for param in required_params: + if not hasattr(module, param) or module.get_parameter(param) is None: + raise ValueError(f"Module missing required parameter: {param}") + + def _get_weights(self, module): + """Extract and return weight tensors from module.""" + return { + "gate": module.get_parameter("gate_proj_weight"), + "up": module.get_parameter("up_proj_weight"), + "down": module.get_parameter("down_proj_weight"), + } + + def _execute_projections_gpu( + self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device + ): + """Execute gate and up projections using GPU-only operations.""" + # Find valid experts using GPU operations + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return [], [] + + # Prepare metadata in batch using GPU operations + problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs = ( + self._prepare_gate_up_metadata_gpu( + input_tokens, + weight1, + weight2, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ) + ) + + if len(problem_sizes) == 0: + return [], [] + + # Execute grouped GEMM + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return gate_outputs, up_outputs + + def _prepare_gate_up_metadata_gpu( + self, + input_tokens, + gate_weights, + up_weights, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ): + """Prepare metadata for gate and up projections""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + gate_outputs = [] + up_outputs = [] + + # Extract valid sizes and offsets (minimal sync - only for valid experts) + valid_sizes = m_sizes_gpu[valid_indices] + valid_offsets = ( + m_offsets_gpu[valid_indices] + if len(m_offsets_gpu) > len(valid_indices) + else torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 + ) + ) + + # Convert to Python for iteration (unavoidable in this test for metadata preparation) + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + valid_indices_cpu = valid_indices.cpu().tolist() + + for i, (expert_idx, size, offset) in enumerate( + zip(valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu) + ): + if size > 0: + # Get expert data + expert_tokens = input_tokens[offset : offset + size].contiguous() + gate_weight = gate_weights[expert_idx].contiguous() + up_weight = up_weights[expert_idx].contiguous() + + M, K = expert_tokens.shape + N = gate_weight.shape[0] + L = 1 + + # Create output tensors + gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + up_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + + # Add both projections to metadata + for weight, output, output_list in [ + (gate_weight, gate_output, gate_outputs), + (up_weight, up_output, up_outputs), + ]: + self._add_projection_to_metadata( + expert_tokens, + weight, + output, + problem_sizes, + strides_abc, + ptrs_abc, + ) + output_list.append(output) + + return problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs + + def _execute_down_projection_gpu( + self, hidden_states, down_weights, m_sizes_gpu, device + ): + """Execute down projection using GPU 
operations.""" + if not hidden_states: + return [] + + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + # Prepare metadata + problem_sizes, strides_abc, ptrs_abc, down_outputs = ( + self._prepare_down_metadata_gpu( + hidden_states, down_weights, valid_indices, device + ) + ) + + if len(problem_sizes) == 0: + return [] + + # Execute grouped GEMM + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return down_outputs + + def _prepare_down_metadata_gpu( + self, hidden_states, down_weights, valid_indices, device + ): + """Prepare metadata for down projection using GPU operations.""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + down_outputs = [] + + # Convert indices to CPU for iteration (minimal sync) + valid_indices_cpu = valid_indices.cpu().tolist() + + for i, expert_idx in enumerate(valid_indices_cpu): + if i < len(hidden_states): + hidden = hidden_states[i] + down_weight = down_weights[expert_idx].contiguous() + + M, K = hidden.shape + N = down_weight.shape[0] + + # Create output tensor + down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + down_outputs.append(down_output) + + # Add to metadata + self._add_projection_to_metadata( + hidden, + down_weight, + down_output, + problem_sizes, + strides_abc, + ptrs_abc, + ) + + return problem_sizes, strides_abc, ptrs_abc, down_outputs + + def _add_projection_to_metadata( + self, + input_tensor, + weight_tensor, + output_tensor, + problem_sizes, + strides_abc, + ptrs_abc, + ): + """Add a single projection to the metadata lists.""" + M, K = input_tensor.shape + N = weight_tensor.shape[0] + L = 1 + + # Convert to MNKL format + input_mnkl = input_tensor.unsqueeze(-1).contiguous() + weight_mnkl = weight_tensor.unsqueeze(-1).contiguous() + output_mnkl = output_tensor.unsqueeze(-1).contiguous() + + # Extract strides + input_strides = list(input_mnkl.stride()[:2]) + weight_strides = list(weight_mnkl.stride()[:2]) + output_strides = list(output_mnkl.stride()[:2]) + + # Add to metadata + problem_sizes.append([M, N, K, L]) + strides_abc.append([input_strides, weight_strides, output_strides]) + ptrs_abc.append( + [ + input_tensor.data_ptr(), + weight_tensor.data_ptr(), + output_tensor.data_ptr(), + ] + ) + + def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): + """Execute the grouped GEMM kernel.""" + num_groups = len(problem_sizes) + + # Convert to CUTE tensors + problem_sizes_cute, strides_cute, ptrs_cute = self._convert_to_cute_tensors( + problem_sizes, strides_abc, ptrs_abc, device + ) + + # Get tensormap and compute clusters + tensormap_cute = self._get_tensormap_buffer(device) + total_clusters = self._compute_total_clusters(problem_sizes) + + # Get initial tensors for compilation + initial_tensors = self._create_initial_tensors(problem_sizes[0], device) + + # Compile or retrieve kernel + compiled_kernel = self._get_compiled_kernel( + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ) + + # Execute kernel + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + torch.cuda.synchronize() + + def _convert_to_cute_tensors(self, problem_sizes, strides_abc, ptrs_abc, device): + """Convert metadata to CUTE tensors.""" + problem_sizes_tensor = torch.tensor( + problem_sizes, dtype=torch.int32, device=device + ) + strides_tensor = torch.tensor(strides_abc, 
dtype=torch.int32, device=device) + ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) + + return ( + from_dlpack(problem_sizes_tensor, assumed_align=self.ALIGNMENT), + from_dlpack(strides_tensor, assumed_align=self.ALIGNMENT), + from_dlpack(ptrs_tensor, assumed_align=self.ALIGNMENT), + ) + + def _get_compiled_kernel( + self, + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ): + """Get or compile the grouped GEMM kernel.""" + cache_key = ( + num_groups, + total_clusters, + self.use_2cta_instrs, + self.mma_tiler_mn, + self.cluster_shape_mn, + ) + + if cache_key not in self._compiled_kernels: + print( + f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}" + ) + + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + *initial_tensors, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + self.stream, + ) + print("Kernel compilation successful") + + return self._compiled_kernels[cache_key] + + def _create_initial_tensors(self, problem_shape, device): + """Create initial CUTE tensors for kernel compilation.""" + M, N, K, L = problem_shape + + # Create tensors + tensors = [ + torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A + torch.randn(N, K, dtype=self.DTYPE_TORCH, device=device), # B + torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C + ] + + # Convert to MNKL format and create CUTE tensors + cute_tensors = [] + for tensor in tensors: + mnkl_tensor = tensor.unsqueeze(-1).contiguous() + cute_tensor = from_dlpack(mnkl_tensor, assumed_align=self.ALIGNMENT) + cute_tensor.element_type = self.DTYPE_CUTLASS + cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) + cute_tensors.append(cute_tensor) + + return cute_tensors + + def _get_tensormap_buffer(self, device): + """Get or create tensormap buffer.""" + if device not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + tensormap_tensor = torch.zeros( + (sm_count, self.TENSORMAP_COUNT, self.TENSORMAP_BYTES // 8), + dtype=torch.int64, + device=device, + ) + self._tensormap_buffers[device] = from_dlpack( + tensormap_tensor, assumed_align=self.ALIGNMENT + ) + + return self._tensormap_buffers[device] + + def _compute_total_clusters(self, problem_sizes): + """Compute total number of clusters needed.""" + cluster_tile_m = self.mma_tiler_mn[0] + cluster_tile_n = self.mma_tiler_mn[1] + + # Adjust for 2 CTA mode and cluster shape + if self.use_2cta_instrs: + cluster_tile_m //= 2 + + cluster_tile_m *= self.cluster_shape_mn[0] + cluster_tile_n *= self.cluster_shape_mn[1] + + total = 0 + for M, N, K, L in problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + + return total + + def _apply_activation_and_combine(self, gate_outputs, up_outputs): + """Apply activation and combine gate/up outputs.""" + return [ + self.activation_function(gate_out) * up_out + for gate_out, up_out in zip(gate_outputs, up_outputs) + ] + + def _reconstruct_output_gpu( + self, final_outputs, m_sizes_gpu, m_offsets_gpu, output + ): + """Reconstruct the full output tensor using GPU operations (minimal sync).""" + if not final_outputs: + return output + + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, 
as_tuple=True)[0] + valid_sizes = m_sizes_gpu[valid_indices] + + # Compute offsets if not provided properly + if len(m_offsets_gpu) <= len(valid_indices): + valid_offsets = torch.cumsum( + torch.cat( + [torch.tensor([0], device=m_sizes_gpu.device), valid_sizes[:-1]] + ), + dim=0, + ) + else: + valid_offsets = m_offsets_gpu[valid_indices] + + # Convert to CPU for final reconstruction (minimal sync) + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + + for i, (size, offset) in enumerate(zip(valid_sizes_cpu, valid_offsets_cpu)): + if i < len(final_outputs): + output[offset : offset + size] = final_outputs[i] + + return output + + @staticmethod + def is_available() -> bool: + return HAS_CUTLASS + + +# ========================= end of CUTLASSGroupedGemmStrategy ========================= + + +class CUTLASSGroupedGemmStrategy_incorrect(GroupGEMMStrategy): """ Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. diff --git a/torchtitan/experiments/deepseek_v3/group_gemms.py b/torchtitan/experiments/deepseek_v3/group_gemms.py index 41344ee5b..abaf5d907 100644 --- a/torchtitan/experiments/deepseek_v3/group_gemms.py +++ b/torchtitan/experiments/deepseek_v3/group_gemms.py @@ -853,643 +853,6 @@ def is_available() -> bool: # ========================= end of CUTLASSGroupedGemmStrategy ========================= -class CUTLASSGroupedGemmStrategy_prev(GroupGEMMStrategy): - """ - Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. - - This version eliminates CPU-GPU synchronization by keeping all size/offset computations on GPU. - """ - - # Constants (same as before) - SUPPORTED_CLUSTER_SHAPES = [ - (1, 1), - (1, 2), - (1, 4), - (2, 1), - (2, 2), - (2, 4), - (4, 1), - (4, 2), - (4, 4), - ] - - SINGLE_CTA_M_SIZES = [128, 64] - DUAL_CTA_M_SIZES = [256, 128] - N_SIZE_RANGE = range(32, 257, 32) # 32 - 256, step 32 - - DTYPE_TORCH = torch.bfloat16 - DTYPE_CUTLASS = cutlass.BFloat16 - ACC_DTYPE = cutlass.Float32 - ALIGNMENT = 16 - TENSORMAP_COUNT = 3 - TENSORMAP_BYTES = 128 - - def __init__( - self, - custom_activation, - use_2cta_instrs=True, - mma_tiler_mn=(256, 128), - cluster_shape_mn=(4, 4), - ): - """Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture.""" - super().__init__(custom_activation) - self.use_2cta_instrs = use_2cta_instrs - - # Set configuration - self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() - self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() - - # Validate configurations - # self._validate_configurations() - - # Initialize kernel and hardware info - self._initialize_kernel() - self._initialize_hardware() - - # Initialize caches - self._compiled_kernels = {} - self._tensormap_buffers = {} - - self._log_initialization() - - def _get_default_mma_tiler(self): - """Get default MMA tiler configuration based on CTA mode.""" - return (256, 128) if self.use_2cta_instrs else (128, 128) - - def _get_default_cluster_shape(self): - """Get default cluster shape based on CTA mode.""" - return (2, 2) if self.use_2cta_instrs else (1, 1) - - def _initialize_kernel(self): - """Initialize the CUTLASS grouped GEMM kernel.""" - self.grouped_gemm = GroupedGemmKernel( - acc_dtype=self.ACC_DTYPE, - use_2cta_instrs=self.use_2cta_instrs, - mma_tiler_mn=self.mma_tiler_mn, - cluster_shape_mn=self.cluster_shape_mn, - tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, - ) - - def _initialize_hardware(self): - """Initialize 
hardware information and stream.""" - self.hardware_info = utils.HardwareInfo() - self.max_active_clusters = self.hardware_info.get_max_active_clusters( - self.cluster_shape_mn[0] * self.cluster_shape_mn[1] - ) - - torch_stream = torch.cuda.current_stream() - self.stream = cuda.CUstream(torch_stream.cuda_stream) - - def _validate_configurations(self): - """Validate configurations for Blackwell.""" - self._validate_mma_tiler() - self._validate_cluster_shape() - self._validate_2cta_constraints() - - def _validate_mma_tiler(self): - """Validate MMA tiler configuration.""" - m_size, n_size = self.mma_tiler_mn - - valid_m_sizes = ( - self.DUAL_CTA_M_SIZES if self.use_2cta_instrs else self.SINGLE_CTA_M_SIZES - ) - mode_name = "2 CTA" if self.use_2cta_instrs else "single CTA" - - if m_size not in valid_m_sizes: - raise ValueError( - f"For {mode_name} mode on Blackwell, MMA tiler M must be in {valid_m_sizes}, got {m_size}" - ) - - if n_size not in self.N_SIZE_RANGE: - raise ValueError( - f"MMA tiler N must be in range [32, 256] with step 32, got {n_size}" - ) - - def _validate_cluster_shape(self): - """Validate cluster shape configuration.""" - if self.cluster_shape_mn not in self.SUPPORTED_CLUSTER_SHAPES: - raise ValueError( - f"Cluster shape {self.cluster_shape_mn} not supported on Blackwell. " - f"Valid cluster shapes are: {self.SUPPORTED_CLUSTER_SHAPES}" - ) - - def _validate_2cta_constraints(self): - """Validate 2 CTA specific constraints.""" - if self.use_2cta_instrs and self.cluster_shape_mn[0] % 2 != 0: - valid_2cta_shapes = [ - shape for shape in self.SUPPORTED_CLUSTER_SHAPES if shape[0] % 2 == 0 - ] - raise ValueError( - f"For 2 CTA mode, cluster shape M must be even, got {self.cluster_shape_mn[0]}. " - f"Valid 2 CTA cluster shapes: {valid_2cta_shapes}" - ) - - def _log_initialization(self): - """Log initialization information.""" - cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] - print(f"Initialized CUTLASSGroupedGemmStrategy for Blackwell with:") - print(f" - 2 CTA instructions: {self.use_2cta_instrs}") - print(f" - MMA tiler (M, N): {self.mma_tiler_mn}") - print(f" - Cluster shape (M, N): {self.cluster_shape_mn}") - print(f" - Cluster size: {cluster_size}") - if cluster_size > 1: - print(f" - Using multi-CTA parallelism") - - def arrange_expert_weights(self, all_weights, submod_name, module): - """Store weights in stacked format.""" - return torch.stack(all_weights) - - def execute(self, contig_tokens, m_sizes, m_offsets, module): - """ - Execute using CUTLASS grouped GEMM kernel - GPU-only version. 
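For orientation, the three-stage flow this strategy implements with grouped GEMMs (gate/up projection, activation-and-combine, down projection) reduces to the following plain-PyTorch reference. This is an illustrative sketch only; SiLU is assumed as the custom activation (the docstrings cite it as an example) and per-expert matmuls stand in for the grouped kernel.

.. code-block:: python

    # Reference-only sketch: per-expert version of the three-stage computation.
    # Weights are in standard PyTorch Linear layout here.
    import torch
    import torch.nn.functional as F

    def expert_forward_reference(tokens, w_gate, w_up, w_down, act=F.silu):
        gate = tokens @ w_gate.t()      # [M, intermediate_size]
        up = tokens @ w_up.t()          # [M, intermediate_size]
        hidden = act(gate) * up         # activation-and-combine stage
        return hidden @ w_down.t()      # [M, hidden_size]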
- - Args: - contig_tokens: Input tokens arranged contiguously by expert - m_sizes: Tensor of expert sizes (GPU tensor to avoid sync) - m_offsets: Tensor of expert offsets (GPU tensor to avoid sync) - module: MoE module containing weights - """ - # Convert to GPU tensors if needed (avoid CPU-GPU sync) - m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( - m_sizes, m_offsets, contig_tokens.device - ) - - # Validate inputs - # self._validate_inputs(contig_tokens, m_sizes_gpu, module) - - # Get weights and device - weights = self._get_weights(module) - device = contig_tokens.device - - # Prepare output tensor - output = torch.zeros( - contig_tokens.shape[0], - weights["gate"].shape[2], - dtype=self.DTYPE_TORCH, - device=device, - ) - - # Check for valid experts using GPU operations (no sync) - if not self._has_valid_experts_gpu(m_sizes_gpu): - return output - - # Execute the three-stage computation using GPU-only operations - gate_outputs, up_outputs = self._execute_projections_gpu( - contig_tokens, - weights["gate"], - weights["up"], - m_sizes_gpu, - m_offsets_gpu, - device, - ) - - hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) - - final_outputs = self._execute_down_projection_gpu( - hidden_states, weights["down"], m_sizes_gpu, device - ) - - return self._reconstruct_output_gpu( - final_outputs, m_sizes_gpu, m_offsets_gpu, output - ) - - def _ensure_gpu_tensors(self, m_sizes, m_offsets, device): - """Ensure sizes and offsets are GPU tensors to avoid CPU-GPU sync.""" - if not isinstance(m_sizes, torch.Tensor): - m_sizes_gpu = torch.tensor(m_sizes, dtype=torch.int32, device=device) - else: - m_sizes_gpu = m_sizes.to(device=device, dtype=torch.int32) - - if not isinstance(m_offsets, torch.Tensor): - m_offsets_gpu = torch.tensor(m_offsets, dtype=torch.int32, device=device) - else: - m_offsets_gpu = m_offsets.to(device=device, dtype=torch.int32) - - return m_sizes_gpu, m_offsets_gpu - - def _has_valid_experts_gpu(self, m_sizes_gpu): - """Check if any experts have tokens using GPU operations (no sync).""" - return torch.any( - m_sizes_gpu > 0 - ).item() # Single sync here is unavoidable for control flow - - def _validate_inputs(self, contig_tokens, m_sizes_gpu, module): - """Validate input parameters.""" - if contig_tokens.dtype != self.DTYPE_TORCH: - raise ValueError( - f"Expected input dtype {self.DTYPE_TORCH}, got {contig_tokens.dtype}" - ) - - if len(contig_tokens.shape) != 2: - raise ValueError( - f"Expected 2D input tensor, got shape {contig_tokens.shape}" - ) - - required_params = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] - for param in required_params: - if not hasattr(module, param) or module.get_parameter(param) is None: - raise ValueError(f"Module missing required parameter: {param}") - - def _get_weights(self, module): - """Extract and return weight tensors from module.""" - return { - "gate": module.get_parameter("gate_proj_weight"), - "up": module.get_parameter("up_proj_weight"), - "down": module.get_parameter("down_proj_weight"), - } - - def _execute_projections_gpu( - self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device - ): - """Execute gate and up projections using GPU-only operations.""" - # Find valid experts using GPU operations - valid_mask = m_sizes_gpu > 0 - valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] - - if len(valid_indices) == 0: - return [], [] - - # Prepare metadata in batch using GPU operations - problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs = ( - 
self._prepare_gate_up_metadata_gpu( - input_tokens, - weight1, - weight2, - m_sizes_gpu, - m_offsets_gpu, - valid_indices, - device, - ) - ) - - if len(problem_sizes) == 0: - return [], [] - - # Execute grouped GEMM - self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) - - return gate_outputs, up_outputs - - def _prepare_gate_up_metadata_gpu( - self, - input_tokens, - gate_weights, - up_weights, - m_sizes_gpu, - m_offsets_gpu, - valid_indices, - device, - ): - """Prepare metadata for gate and up projections""" - problem_sizes = [] - strides_abc = [] - ptrs_abc = [] - gate_outputs = [] - up_outputs = [] - - # Extract valid sizes and offsets (minimal sync - only for valid experts) - valid_sizes = m_sizes_gpu[valid_indices] - valid_offsets = ( - m_offsets_gpu[valid_indices] - if len(m_offsets_gpu) > len(valid_indices) - else torch.cumsum( - torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 - ) - ) - - # Convert to Python for iteration (unavoidable in this test for metadata preparation) - valid_sizes_cpu = valid_sizes.cpu().tolist() - valid_offsets_cpu = valid_offsets.cpu().tolist() - valid_indices_cpu = valid_indices.cpu().tolist() - - for i, (expert_idx, size, offset) in enumerate( - zip(valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu) - ): - if size > 0: - # Get expert data - expert_tokens = input_tokens[offset : offset + size].contiguous() - gate_weight = gate_weights[expert_idx].contiguous() - up_weight = up_weights[expert_idx].contiguous() - - M, K = expert_tokens.shape - N = gate_weight.shape[0] - L = 1 - - # Create output tensors - gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) - up_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) - - # Add both projections to metadata - for weight, output, output_list in [ - (gate_weight, gate_output, gate_outputs), - (up_weight, up_output, up_outputs), - ]: - self._add_projection_to_metadata( - expert_tokens, - weight, - output, - problem_sizes, - strides_abc, - ptrs_abc, - ) - output_list.append(output) - - return problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs - - def _execute_down_projection_gpu( - self, hidden_states, down_weights, m_sizes_gpu, device - ): - """Execute down projection using GPU operations.""" - if not hidden_states: - return [] - - # Find valid experts - valid_mask = m_sizes_gpu > 0 - valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] - - # Prepare metadata - problem_sizes, strides_abc, ptrs_abc, down_outputs = ( - self._prepare_down_metadata_gpu( - hidden_states, down_weights, valid_indices, device - ) - ) - - if len(problem_sizes) == 0: - return [] - - # Execute grouped GEMM - self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) - - return down_outputs - - def _prepare_down_metadata_gpu( - self, hidden_states, down_weights, valid_indices, device - ): - """Prepare metadata for down projection using GPU operations.""" - problem_sizes = [] - strides_abc = [] - ptrs_abc = [] - down_outputs = [] - - # Convert indices to CPU for iteration (minimal sync) - valid_indices_cpu = valid_indices.cpu().tolist() - - for i, expert_idx in enumerate(valid_indices_cpu): - if i < len(hidden_states): - hidden = hidden_states[i] - down_weight = down_weights[expert_idx].contiguous() - - M, K = hidden.shape - N = down_weight.shape[0] - - # Create output tensor - down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) - down_outputs.append(down_output) - - # Add to metadata - 
self._add_projection_to_metadata( - hidden, - down_weight, - down_output, - problem_sizes, - strides_abc, - ptrs_abc, - ) - - return problem_sizes, strides_abc, ptrs_abc, down_outputs - - def _add_projection_to_metadata( - self, - input_tensor, - weight_tensor, - output_tensor, - problem_sizes, - strides_abc, - ptrs_abc, - ): - """Add a single projection to the metadata lists.""" - M, K = input_tensor.shape - N = weight_tensor.shape[0] - L = 1 - - # Convert to MNKL format - input_mnkl = input_tensor.unsqueeze(-1).contiguous() - weight_mnkl = weight_tensor.unsqueeze(-1).contiguous() - output_mnkl = output_tensor.unsqueeze(-1).contiguous() - - # Extract strides - input_strides = list(input_mnkl.stride()[:2]) - weight_strides = list(weight_mnkl.stride()[:2]) - output_strides = list(output_mnkl.stride()[:2]) - - # Add to metadata - problem_sizes.append([M, N, K, L]) - strides_abc.append([input_strides, weight_strides, output_strides]) - ptrs_abc.append( - [ - input_tensor.data_ptr(), - weight_tensor.data_ptr(), - output_tensor.data_ptr(), - ] - ) - - def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): - """Execute the grouped GEMM kernel.""" - num_groups = len(problem_sizes) - - # Convert to CUTE tensors - problem_sizes_cute, strides_cute, ptrs_cute = self._convert_to_cute_tensors( - problem_sizes, strides_abc, ptrs_abc, device - ) - - # Get tensormap and compute clusters - tensormap_cute = self._get_tensormap_buffer(device) - total_clusters = self._compute_total_clusters(problem_sizes) - - # Get initial tensors for compilation - initial_tensors = self._create_initial_tensors(problem_sizes[0], device) - - # Compile or retrieve kernel - compiled_kernel = self._get_compiled_kernel( - num_groups, - total_clusters, - initial_tensors, - problem_sizes_cute, - strides_cute, - ptrs_cute, - tensormap_cute, - ) - - # Execute kernel - compiled_kernel( - *initial_tensors, - problem_sizes_cute, - strides_cute, - ptrs_cute, - tensormap_cute, - self.stream, - ) - torch.cuda.synchronize() - - def _convert_to_cute_tensors(self, problem_sizes, strides_abc, ptrs_abc, device): - """Convert metadata to CUTE tensors.""" - problem_sizes_tensor = torch.tensor( - problem_sizes, dtype=torch.int32, device=device - ) - strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) - ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) - - return ( - from_dlpack(problem_sizes_tensor, assumed_align=self.ALIGNMENT), - from_dlpack(strides_tensor, assumed_align=self.ALIGNMENT), - from_dlpack(ptrs_tensor, assumed_align=self.ALIGNMENT), - ) - - def _get_compiled_kernel( - self, - num_groups, - total_clusters, - initial_tensors, - problem_sizes_cute, - strides_cute, - ptrs_cute, - tensormap_cute, - ): - """Get or compile the grouped GEMM kernel.""" - cache_key = ( - num_groups, - total_clusters, - self.use_2cta_instrs, - self.mma_tiler_mn, - self.cluster_shape_mn, - ) - - if cache_key not in self._compiled_kernels: - print( - f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}" - ) - - self._compiled_kernels[cache_key] = cute.compile( - self.grouped_gemm, - *initial_tensors, - num_groups, - problem_sizes_cute, - strides_cute, - ptrs_cute, - total_clusters, - tensormap_cute, - self.max_active_clusters, - self.stream, - ) - print("Kernel compilation successful") - - return self._compiled_kernels[cache_key] - - def _create_initial_tensors(self, problem_shape, device): - """Create initial CUTE tensors for 
kernel compilation.""" - M, N, K, L = problem_shape - - # Create tensors - tensors = [ - torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A - torch.randn(N, K, dtype=self.DTYPE_TORCH, device=device), # B - torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C - ] - - # Convert to MNKL format and create CUTE tensors - cute_tensors = [] - for tensor in tensors: - mnkl_tensor = tensor.unsqueeze(-1).contiguous() - cute_tensor = from_dlpack(mnkl_tensor, assumed_align=self.ALIGNMENT) - cute_tensor.element_type = self.DTYPE_CUTLASS - cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) - cute_tensors.append(cute_tensor) - - return cute_tensors - - def _get_tensormap_buffer(self, device): - """Get or create tensormap buffer.""" - if device not in self._tensormap_buffers: - sm_count = self.hardware_info.get_max_active_clusters(1) - tensormap_tensor = torch.zeros( - (sm_count, self.TENSORMAP_COUNT, self.TENSORMAP_BYTES // 8), - dtype=torch.int64, - device=device, - ) - self._tensormap_buffers[device] = from_dlpack( - tensormap_tensor, assumed_align=self.ALIGNMENT - ) - - return self._tensormap_buffers[device] - - def _compute_total_clusters(self, problem_sizes): - """Compute total number of clusters needed.""" - cluster_tile_m = self.mma_tiler_mn[0] - cluster_tile_n = self.mma_tiler_mn[1] - - # Adjust for 2 CTA mode and cluster shape - if self.use_2cta_instrs: - cluster_tile_m //= 2 - - cluster_tile_m *= self.cluster_shape_mn[0] - cluster_tile_n *= self.cluster_shape_mn[1] - - total = 0 - for M, N, K, L in problem_sizes: - clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m - clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n - total += clusters_m * clusters_n - - return total - - def _apply_activation_and_combine(self, gate_outputs, up_outputs): - """Apply activation and combine gate/up outputs.""" - return [ - self.activation_function(gate_out) * up_out - for gate_out, up_out in zip(gate_outputs, up_outputs) - ] - - def _reconstruct_output_gpu( - self, final_outputs, m_sizes_gpu, m_offsets_gpu, output - ): - """Reconstruct the full output tensor using GPU operations (minimal sync).""" - if not final_outputs: - return output - - # Find valid experts - valid_mask = m_sizes_gpu > 0 - valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] - valid_sizes = m_sizes_gpu[valid_indices] - - # Compute offsets if not provided properly - if len(m_offsets_gpu) <= len(valid_indices): - valid_offsets = torch.cumsum( - torch.cat( - [torch.tensor([0], device=m_sizes_gpu.device), valid_sizes[:-1]] - ), - dim=0, - ) - else: - valid_offsets = m_offsets_gpu[valid_indices] - - # Convert to CPU for final reconstruction (minimal sync) - valid_sizes_cpu = valid_sizes.cpu().tolist() - valid_offsets_cpu = valid_offsets.cpu().tolist() - - for i, (size, offset) in enumerate(zip(valid_sizes_cpu, valid_offsets_cpu)): - if i < len(final_outputs): - output[offset : offset + size] = final_outputs[i] - - return output - - @staticmethod - def is_available() -> bool: - return HAS_CUTLASS - - -# ========================= end of CUTLASSGroupedGemmStrategy ========================= - - class TritonCGBF16GroupGEMM(GroupGEMMStrategy): """Implementation of Triton Contiguous group Gemm""" From 388de94d4f2a4262829d38c6fcb1745b9d2b87aa Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sat, 21 Jun 2025 23:01:30 -0700 Subject: [PATCH 28/34] simpler standalone version --- .../deepseek_v3/cutlass_grouped_gemm.py | 1139 +++-------------- 1 file changed, 180 insertions(+), 959 deletions(-) diff --git 
a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py index bd2d540c9..a5031f7fb 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py @@ -1,43 +1,50 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - """ -CUTLASS GroupedGEMM Strategy for Blackwell architecture. + Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. -This module contains the CUTLASSGroupedGemmStrategy implementation that uses -CUTLASS GroupedGemmKernel for high-performance group GEMM operations with -minimal CPU-GPU synchronization and optional input validation. + Optimized version with pre-transposed weights to eliminate runtime transpose operations. """ """ -[DEBUG] Gate/Up projection expert 14 - - expert_tokens: torch.Size([12288, 2048]) - - gate_weight (after .t()): torch.Size([2048, 1408]) - - up_weight (after .t()): torch.Size([2048, 1408]) -[DEBUG] Matrix mult: input torch.Size([12288, 2048]) @ weight torch.Size([2048, 1408]) -> output torch.Size([12288, 1408]) -[DEBUG] Gate/Up projection expert 15[DEBUG] Matrix mult: input torch.Size([12288, 2048]) @ weight torch.Size([2048, 1408]) -> output torch.Size([12288, 1408]) - - - expert_tokens: torch.Size([12288, 2048]) - - gate_weight (after .t()): torch.Size([2048, 1408]) - - up_weight (after .t()): torch.Size([2048, 1408]) -[DEBUG] Matrix mult: input torch.Size([12288, 2048]) @ weight torch.Size([2048, 1408]) -> output torch.Size([12288, 1408]) -[DEBUG] Matrix mult: input torch.Size([12288, 2048]) @ weight torch.Size([2048, 1408]) -> output torch.Size([12288, 1408]) -down projection: -[DEBUG] Down projection expert 10[DEBUG] Matrix mult: input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) - -[DEBUG] Down projection expert 10 +Shapes: +Kernel compilation successful +[DEBUG] Down projection expert 0 (optimized) - hidden: torch.Size([12288, 1408]) - - hidden: torch.Size([12288, 1408]) - down_weight (after .t()): torch.Size([1408, 2048]) - - - down_weight (after .t()): torch.Size([1408, 2048]) -[DEBUG] Matrix mult: input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) -[DEBUG] Matrix mult: input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) - + - down_weight (pre-transposed): torch.Size([1408, 2048]) +[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) +[DEBUG] Down projection expert 1 (optimized) + - hidden: torch.Size([12288, 1408]) + - down_weight (pre-transposed): torch.Size([1408, 2048]) +[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) +[DEBUG] Down projection expert 2 (optimized) + - hidden: torch.Size([12288, 1408]) + - down_weight (pre-transposed): torch.Size([1408, 2048]) +2025-06-21 22:58:46,021 - INFO - cuModuleLoadData 1080453344 +[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) +[DEBUG] Down projection expert 3 (optimized) + - hidden: torch.Size([12288, 1408]) + - down_weight (pre-transposed): torch.Size([1408, 
2048]) +2025-06-21 22:58:46,021 - INFO - cuModuleGetFunction kernel_cutlass_kernel_torchtitanexperimentskernelsblackwellcute_grouped_gemmGroupedGemmKernel_object_at__TiledMMA_ThrLayoutVMNK21111000_PermutationMNK____MMAAtom_ThrID21_ShapeMNK25612816__0 +2025-06-21 22:58:46,021 - INFO - <-- cuModuleGetFunction +[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) +[DEBUG] Down projection expert 4 (optimized) + - hidden: torch.Size([12288, 1408]) + - down_weight (pre-transposed): torch.Size([1408, 2048]) +[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) +[DEBUG] Down projection expert 5 (optimized) + - hidden: torch.Size([12288, 1408]) + - down_weight (pre-transposed): torch.Size([1408, 2048]) +[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) +[DEBUG] Down projection expert 6 (optimized) + - hidden: torch.Size([12288, 1408]) + - down_weight (pre-transposed): torch.Size([1408, 2048]) +[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) +[DEBUG] Down projection expert 7 (optimized) + - hidden: torch.Size([12288, 1408]) + - down_weight (pre-transposed): torch.Size([1408, 2048]) +Error: +Error using cutlass strategy: The expanded size of the tensor (1408) must match the existing size (2048) at non-singleton dimension 1. Target sizes: [12288, 1408]. Tensor sizes: [12288, 2048] """ @@ -75,14 +82,8 @@ logger = logging.getLogger(__name__) -class CUTLASSGroupedGemmStrategy_prev(GroupGEMMStrategy): - """ - Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. - - This version eliminates CPU-GPU synchronization by keeping all size/offset computations on GPU. 
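When per-expert offsets are not supplied with one entry per valid expert, the strategy reconstructs them from the sizes with an exclusive prefix sum that stays on the device. A minimal standalone sketch (hypothetical helper name, not part of this module's API):

.. code-block:: python

    # Exclusive prefix sum of expert sizes, computed without leaving the device.
    import torch

    def offsets_from_sizes(m_sizes: torch.Tensor) -> torch.Tensor:
        zero = torch.zeros(1, dtype=m_sizes.dtype, device=m_sizes.device)
        return torch.cumsum(torch.cat([zero, m_sizes[:-1]]), dim=0)

    sizes = torch.tensor([4, 0, 3, 5], dtype=torch.int32)
    print(offsets_from_sizes(sizes))  # -> [0, 4, 4, 7]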
- """ - - # Constants (same as before) +class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): + # Constants SUPPORTED_CLUSTER_SHAPES = [ (1, 1), (1, 2), @@ -121,9 +122,6 @@ def __init__( self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() - # Validate configurations - # self._validate_configurations() - # Initialize kernel and hardware info self._initialize_kernel() self._initialize_hardware() @@ -162,50 +160,6 @@ def _initialize_hardware(self): torch_stream = torch.cuda.current_stream() self.stream = cuda.CUstream(torch_stream.cuda_stream) - def _validate_configurations(self): - """Validate configurations for Blackwell.""" - self._validate_mma_tiler() - self._validate_cluster_shape() - self._validate_2cta_constraints() - - def _validate_mma_tiler(self): - """Validate MMA tiler configuration.""" - m_size, n_size = self.mma_tiler_mn - - valid_m_sizes = ( - self.DUAL_CTA_M_SIZES if self.use_2cta_instrs else self.SINGLE_CTA_M_SIZES - ) - mode_name = "2 CTA" if self.use_2cta_instrs else "single CTA" - - if m_size not in valid_m_sizes: - raise ValueError( - f"For {mode_name} mode on Blackwell, MMA tiler M must be in {valid_m_sizes}, got {m_size}" - ) - - if n_size not in self.N_SIZE_RANGE: - raise ValueError( - f"MMA tiler N must be in range [32, 256] with step 32, got {n_size}" - ) - - def _validate_cluster_shape(self): - """Validate cluster shape configuration.""" - if self.cluster_shape_mn not in self.SUPPORTED_CLUSTER_SHAPES: - raise ValueError( - f"Cluster shape {self.cluster_shape_mn} not supported on Blackwell. " - f"Valid cluster shapes are: {self.SUPPORTED_CLUSTER_SHAPES}" - ) - - def _validate_2cta_constraints(self): - """Validate 2 CTA specific constraints.""" - if self.use_2cta_instrs and self.cluster_shape_mn[0] % 2 != 0: - valid_2cta_shapes = [ - shape for shape in self.SUPPORTED_CLUSTER_SHAPES if shape[0] % 2 == 0 - ] - raise ValueError( - f"For 2 CTA mode, cluster shape M must be even, got {self.cluster_shape_mn[0]}. " - f"Valid 2 CTA cluster shapes: {valid_2cta_shapes}" - ) - def _log_initialization(self): """Log initialization information.""" cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] @@ -214,52 +168,95 @@ def _log_initialization(self): print(f" - MMA tiler (M, N): {self.mma_tiler_mn}") print(f" - Cluster shape (M, N): {self.cluster_shape_mn}") print(f" - Cluster size: {cluster_size}") + print(f" - Weight optimization: Pre-transposed (no runtime transpose)") if cluster_size > 1: print(f" - Using multi-CTA parallelism") def arrange_expert_weights(self, all_weights, submod_name, module): - """Store weights in stacked format.""" - return torch.stack(all_weights) + """ + Store weights in stacked format with pre-transposition for optimal GEMM performance. + + This eliminates the need for runtime transpose operations. 
+ + Original PyTorch weight shapes: + - gate_proj_weight: [intermediate_size, hidden_size] + - up_proj_weight: [intermediate_size, hidden_size] + - down_proj_weight: [hidden_size, intermediate_size] + + Pre-transposed shapes for direct GEMM usage: + - gate_proj_weight: [hidden_size, intermediate_size] (transposed) + - up_proj_weight: [hidden_size, intermediate_size] (transposed) + - down_proj_weight: [intermediate_size, hidden_size] (transposed) + """ + print(f"[arrange_expert_weights] Processing {submod_name}") + + # Determine if this weight needs transposition based on submodule name + needs_transpose = submod_name in ["gate_proj_weight", "up_proj_weight"] + + transposed_weights = [] + for i, weight in enumerate(all_weights): + original_shape = weight.shape + + if needs_transpose: + # Transpose gate/up weights: [intermediate_size, hidden_size] -> [hidden_size, intermediate_size] + transposed_weight = weight.t().contiguous() + print( + f"[arrange_expert_weights] {submod_name} expert {i}: {original_shape} -> {transposed_weight.shape} (transposed)" + ) + else: + # Keep down weights as-is for now, will transpose during stacking + # down_proj_weight: [hidden_size, intermediate_size] -> [intermediate_size, hidden_size] + transposed_weight = weight.t().contiguous() + print( + f"[arrange_expert_weights] {submod_name} expert {i}: {original_shape} -> {transposed_weight.shape} (transposed)" + ) + + transposed_weights.append(transposed_weight) + + # Stack all transposed weights + stacked = torch.stack(transposed_weights) + print( + f"[arrange_expert_weights] {submod_name} final stacked shape: {stacked.shape}" + ) + + return stacked def execute(self, contig_tokens, m_sizes, m_offsets, module): """ - Execute using CUTLASS grouped GEMM kernel - GPU-only version. + Execute using CUTLASS grouped GEMM kernel with pre-transposed weights. 
Args: contig_tokens: Input tokens arranged contiguously by expert m_sizes: Tensor of expert sizes (GPU tensor to avoid sync) m_offsets: Tensor of expert offsets (GPU tensor to avoid sync) - module: MoE module containing weights + module: MoE module containing pre-transposed weights """ # Convert to GPU tensors if needed (avoid CPU-GPU sync) m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( m_sizes, m_offsets, contig_tokens.device ) - # Validate inputs - # self._validate_inputs(contig_tokens, m_sizes_gpu, module) - - # Get weights and device + # Get pre-transposed weights and device weights = self._get_weights(module) device = contig_tokens.device - # Prepare output tensor + # Prepare output tensor using down projection output size output = torch.zeros( contig_tokens.shape[0], - weights["gate"].shape[2], + weights["down"].shape[1], # hidden_size (after transpose) dtype=self.DTYPE_TORCH, device=device, ) - # Check for valid experts using GPU operations (no sync) + # Check for valid experts using GPU operations (minimal sync) if not self._has_valid_experts_gpu(m_sizes_gpu): return output - # Execute the three-stage computation using GPU-only operations + # Execute the three-stage computation with pre-transposed weights gate_outputs, up_outputs = self._execute_projections_gpu( contig_tokens, - weights["gate"], - weights["up"], + weights["gate"], # Already transposed to [hidden_size, intermediate_size] + weights["up"], # Already transposed to [hidden_size, intermediate_size] m_sizes_gpu, m_offsets_gpu, device, @@ -268,7 +265,10 @@ def execute(self, contig_tokens, m_sizes, m_offsets, module): hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) final_outputs = self._execute_down_projection_gpu( - hidden_states, weights["down"], m_sizes_gpu, device + hidden_states, + weights["down"], # Already transposed to [intermediate_size, hidden_size] + m_sizes_gpu, + device, ) return self._reconstruct_output_gpu( @@ -290,40 +290,27 @@ def _ensure_gpu_tensors(self, m_sizes, m_offsets, device): return m_sizes_gpu, m_offsets_gpu def _has_valid_experts_gpu(self, m_sizes_gpu): - """Check if any experts have tokens using GPU operations (no sync).""" - return torch.any( - m_sizes_gpu > 0 - ).item() # Single sync here is unavoidable for control flow - - def _validate_inputs(self, contig_tokens, m_sizes_gpu, module): - """Validate input parameters.""" - if contig_tokens.dtype != self.DTYPE_TORCH: - raise ValueError( - f"Expected input dtype {self.DTYPE_TORCH}, got {contig_tokens.dtype}" - ) - - if len(contig_tokens.shape) != 2: - raise ValueError( - f"Expected 2D input tensor, got shape {contig_tokens.shape}" - ) - - required_params = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] - for param in required_params: - if not hasattr(module, param) or module.get_parameter(param) is None: - raise ValueError(f"Module missing required parameter: {param}") + """Check if any experts have tokens using GPU operations (minimal sync).""" + return torch.any(m_sizes_gpu > 0).item() def _get_weights(self, module): - """Extract and return weight tensors from module.""" + """Extract pre-transposed weight tensors from module.""" return { - "gate": module.get_parameter("gate_proj_weight"), - "up": module.get_parameter("up_proj_weight"), - "down": module.get_parameter("down_proj_weight"), + "gate": module.get_parameter( + "gate_proj_weight" + ), # Pre-transposed to [num_experts, hidden_size, intermediate_size] + "up": module.get_parameter( + "up_proj_weight" + ), # Pre-transposed to [num_experts, 
hidden_size, intermediate_size] + "down": module.get_parameter( + "down_proj_weight" + ), # Pre-transposed to [num_experts, intermediate_size, hidden_size] } def _execute_projections_gpu( self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device ): - """Execute gate and up projections using GPU-only operations.""" + """Execute gate and up projections using pre-transposed weights.""" # Find valid experts using GPU operations valid_mask = m_sizes_gpu > 0 valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] @@ -331,12 +318,12 @@ def _execute_projections_gpu( if len(valid_indices) == 0: return [], [] - # Prepare metadata in batch using GPU operations + # Prepare metadata with pre-transposed weights problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs = ( self._prepare_gate_up_metadata_gpu( input_tokens, - weight1, - weight2, + weight1, # Pre-transposed gate weights + weight2, # Pre-transposed up weights m_sizes_gpu, m_offsets_gpu, valid_indices, @@ -362,7 +349,7 @@ def _prepare_gate_up_metadata_gpu( valid_indices, device, ): - """Prepare metadata for gate and up projections""" + """Prepare metadata for gate and up projections using pre-transposed weights.""" problem_sizes = [] strides_abc = [] ptrs_abc = [] @@ -379,7 +366,7 @@ def _prepare_gate_up_metadata_gpu( ) ) - # Convert to Python for iteration (unavoidable in this test for metadata preparation) + # Convert to Python for iteration (unavoidable for metadata preparation) valid_sizes_cpu = valid_sizes.cpu().tolist() valid_offsets_cpu = valid_offsets.cpu().tolist() valid_indices_cpu = valid_indices.cpu().tolist() @@ -388,14 +375,30 @@ def _prepare_gate_up_metadata_gpu( zip(valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu) ): if size > 0: - # Get expert data + # Get expert data and PRE-TRANSPOSED weights (no runtime transpose needed!) 
expert_tokens = input_tokens[offset : offset + size].contiguous() - gate_weight = gate_weights[expert_idx].contiguous() - up_weight = up_weights[expert_idx].contiguous() + gate_weight = gate_weights[ + expert_idx + ].contiguous() # Already [hidden_size, intermediate_size] + up_weight = up_weights[ + expert_idx + ].contiguous() # Already [hidden_size, intermediate_size] + + print(f"[DEBUG] Gate/Up projection expert {expert_idx} (optimized)") + print(f" - expert_tokens: {expert_tokens.shape}") + print(f" - gate_weight (pre-transposed): {gate_weight.shape}") + print(f" - up_weight (pre-transposed): {up_weight.shape}") + + M, K = expert_tokens.shape # M = batch_size, K = hidden_size + K_weight, N = ( + gate_weight.shape + ) # K_weight = hidden_size, N = intermediate_size - M, K = expert_tokens.shape - N = gate_weight.shape[0] - L = 1 + # Verify dimension compatibility + if K != K_weight: + raise ValueError( + f"Dimension mismatch: input has {K} features but weight expects {K_weight}" + ) # Create output tensors gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) @@ -421,7 +424,7 @@ def _prepare_gate_up_metadata_gpu( def _execute_down_projection_gpu( self, hidden_states, down_weights, m_sizes_gpu, device ): - """Execute down projection using GPU operations.""" + """Execute down projection using pre-transposed weights.""" if not hidden_states: return [] @@ -447,7 +450,7 @@ def _execute_down_projection_gpu( def _prepare_down_metadata_gpu( self, hidden_states, down_weights, valid_indices, device ): - """Prepare metadata for down projection using GPU operations.""" + """Prepare metadata for down projection using pre-transposed weights.""" problem_sizes = [] strides_abc = [] ptrs_abc = [] @@ -459,10 +462,24 @@ def _prepare_down_metadata_gpu( for i, expert_idx in enumerate(valid_indices_cpu): if i < len(hidden_states): hidden = hidden_states[i] - down_weight = down_weights[expert_idx].contiguous() + down_weight = down_weights[ + expert_idx + ].contiguous() # Already [intermediate_size, hidden_size] + + print(f"[DEBUG] Down projection expert {expert_idx} (optimized)") + print(f" - hidden: {hidden.shape}") + print(f" - down_weight (pre-transposed): {down_weight.shape}") + + M, K = hidden.shape # M = batch_size, K = intermediate_size + K_weight, N = ( + down_weight.shape + ) # K_weight = intermediate_size, N = hidden_size - M, K = hidden.shape - N = down_weight.shape[0] + # Verify dimension compatibility + if K != K_weight: + raise ValueError( + f"Dimension mismatch: hidden has {K} features but weight expects {K_weight}" + ) # Create output tensor down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) @@ -489,11 +506,21 @@ def _add_projection_to_metadata( strides_abc, ptrs_abc, ): - """Add a single projection to the metadata lists.""" + """Add a single projection to the metadata lists (weights are pre-transposed).""" M, K = input_tensor.shape - N = weight_tensor.shape[0] + K_weight, N = weight_tensor.shape L = 1 + print( + f"[DEBUG] Matrix mult (optimized): input {input_tensor.shape} @ weight {weight_tensor.shape} -> output {output_tensor.shape}" + ) + + # Verify dimension compatibility + if K != K_weight: + raise ValueError( + f"Matrix multiplication dimension mismatch: {K} != {K_weight}" + ) + # Convert to MNKL format input_mnkl = input_tensor.unsqueeze(-1).contiguous() weight_mnkl = weight_tensor.unsqueeze(-1).contiguous() @@ -515,6 +542,7 @@ def _add_projection_to_metadata( ] ) + # Rest of the methods remain the same... 
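For reference, the per-group metadata assembled by `_add_projection_to_metadata` reduces to the sketch below: one (M, N, K, L) problem size, the leading two strides of each MNKL-shaped operand, and raw device pointers. The function name is illustrative only; bf16 is chosen to match `DTYPE_TORCH`, and the weight is assumed already in (K, N) layout.

.. code-block:: python

    import torch

    def projection_metadata(inp: torch.Tensor, weight: torch.Tensor, out: torch.Tensor):
        M, K = inp.shape          # tokens x input features
        K_w, N = weight.shape     # weight is pre-transposed to (K, N)
        assert K == K_w, "GEMM dimension mismatch"
        L = 1                     # each expert is a batch of size 1
        # Appending a trailing unit mode gives the MNKL layout the kernel expects.
        strides = [list(t.unsqueeze(-1).contiguous().stride()[:2]) for t in (inp, weight, out)]
        ptrs = [t.data_ptr() for t in (inp, weight, out)]
        return [M, N, K, L], strides, ptrs

    a = torch.randn(8, 16, dtype=torch.bfloat16)
    b = torch.randn(16, 32, dtype=torch.bfloat16)
    c = torch.zeros(8, 32, dtype=torch.bfloat16)
    print(projection_metadata(a, b, c)[0])  # [8, 32, 16, 1]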
def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): """Execute the grouped GEMM kernel.""" num_groups = len(problem_sizes) @@ -588,7 +616,8 @@ def _get_compiled_kernel( if cache_key not in self._compiled_kernels: print( - f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}" + f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, " + f"2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}" ) self._compiled_kernels[cache_key] = cute.compile( @@ -611,10 +640,12 @@ def _create_initial_tensors(self, problem_shape, device): """Create initial CUTE tensors for kernel compilation.""" M, N, K, L = problem_shape - # Create tensors + # Create tensors (weights are already in correct transposed format) tensors = [ torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A - torch.randn(N, K, dtype=self.DTYPE_TORCH, device=device), # B + torch.randn( + K, N, dtype=self.DTYPE_TORCH, device=device + ), # B (pre-transposed) torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C ] @@ -641,7 +672,6 @@ def _get_tensormap_buffer(self, device): self._tensormap_buffers[device] = from_dlpack( tensormap_tensor, assumed_align=self.ALIGNMENT ) - return self._tensormap_buffers[device] def _compute_total_clusters(self, problem_sizes): @@ -707,812 +737,3 @@ def _reconstruct_output_gpu( @staticmethod def is_available() -> bool: return HAS_CUTLASS - - -# ========================= end of CUTLASSGroupedGemmStrategy ========================= - - -class CUTLASSGroupedGemmStrategy_incorrect(GroupGEMMStrategy): - """ - Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. - - """ - - # Constants for Blackwell architecture support - SUPPORTED_CLUSTER_SHAPES = [ - (1, 1), - (1, 2), - (1, 4), - (2, 1), - (2, 2), - (2, 4), - (4, 1), - (4, 2), - (4, 4), - ] - - SINGLE_CTA_M_SIZES = [128, 64] - DUAL_CTA_M_SIZES = [256, 128] - N_SIZE_RANGE = range(32, 257, 32) # 32 - 256, step 32 - - DTYPE_TORCH = torch.bfloat16 - DTYPE_CUTLASS = cutlass.BFloat16 - ACC_DTYPE = cutlass.Float32 - ALIGNMENT = 16 - TENSORMAP_COUNT = 3 - TENSORMAP_BYTES = 128 - - def __init__( - self, - custom_activation, - use_2cta_instrs=True, - mma_tiler_mn=(256, 128), - cluster_shape_mn=(2, 2), - validate=False, - debug_shapes=False, - ): - """ - Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture. 
- - Args: - custom_activation: Activation function to use (e.g., SiLU) - use_2cta_instrs: Whether to use 2 CTA instructions for better performance - mma_tiler_mn: MMA tiler configuration (M, N) - cluster_shape_mn: Cluster shape configuration (M, N) - validate: Whether to validate inputs (disable for performance in production) - debug_shapes: Whether to log tensor shapes for debugging dimension mismatches - """ - super().__init__(custom_activation) - self.use_2cta_instrs = use_2cta_instrs - self.validate = validate - self.debug_shapes = debug_shapes - - # Set configuration - self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() - self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() - - # Validate configurations only if validation is enabled - if self.validate: - self._validate_configurations() - - # Initialize kernel and hardware info - self._initialize_kernel() - self._initialize_hardware() - - # Initialize caches - self._compiled_kernels = {} - self._tensormap_buffers = {} - - self._log_initialization() - - def _get_default_mma_tiler(self): - """Get default MMA tiler configuration based on CTA mode.""" - return (256, 128) if self.use_2cta_instrs else (128, 128) - - def _get_default_cluster_shape(self): - """Get default cluster shape based on CTA mode.""" - return (2, 2) if self.use_2cta_instrs else (1, 1) - - def _initialize_kernel(self): - """Initialize the CUTLASS grouped GEMM kernel.""" - self.grouped_gemm = GroupedGemmKernel( - acc_dtype=self.ACC_DTYPE, - use_2cta_instrs=self.use_2cta_instrs, - mma_tiler_mn=self.mma_tiler_mn, - cluster_shape_mn=self.cluster_shape_mn, - tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, - ) - - def _initialize_hardware(self): - """Initialize hardware information and stream.""" - self.hardware_info = utils.HardwareInfo() - self.max_active_clusters = self.hardware_info.get_max_active_clusters( - self.cluster_shape_mn[0] * self.cluster_shape_mn[1] - ) - - torch_stream = torch.cuda.current_stream() - self.stream = cuda.CUstream(torch_stream.cuda_stream) - - def _validate_configurations(self): - """Validate configurations for Blackwell.""" - self._validate_mma_tiler() - self._validate_cluster_shape() - self._validate_2cta_constraints() - - def _validate_mma_tiler(self): - """Validate MMA tiler configuration.""" - m_size, n_size = self.mma_tiler_mn - - valid_m_sizes = ( - self.DUAL_CTA_M_SIZES if self.use_2cta_instrs else self.SINGLE_CTA_M_SIZES - ) - mode_name = "2 CTA" if self.use_2cta_instrs else "single CTA" - - if m_size not in valid_m_sizes: - raise ValueError( - f"For {mode_name} mode on Blackwell, MMA tiler M must be in {valid_m_sizes}, got {m_size}" - ) - - if n_size not in self.N_SIZE_RANGE: - raise ValueError( - f"MMA tiler N must be in range [32, 256] with step 32, got {n_size}" - ) - - def _validate_cluster_shape(self): - """Validate cluster shape configuration.""" - if self.cluster_shape_mn not in self.SUPPORTED_CLUSTER_SHAPES: - raise ValueError( - f"Cluster shape {self.cluster_shape_mn} not supported on Blackwell. " - f"Valid cluster shapes are: {self.SUPPORTED_CLUSTER_SHAPES}" - ) - - def _validate_2cta_constraints(self): - """Validate 2 CTA specific constraints.""" - if self.use_2cta_instrs and self.cluster_shape_mn[0] % 2 != 0: - valid_2cta_shapes = [ - shape for shape in self.SUPPORTED_CLUSTER_SHAPES if shape[0] % 2 == 0 - ] - raise ValueError( - f"For 2 CTA mode, cluster shape M must be even, got {self.cluster_shape_mn[0]}. 
" - f"Valid 2 CTA cluster shapes: {valid_2cta_shapes}" - ) - - def _log_initialization(self): - """Log initialization information.""" - cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] - logger.info(f"Initialized CUTLASSGroupedGemmStrategy for Blackwell with:") - logger.info(f" - 2 CTA instructions: {self.use_2cta_instrs}") - logger.info(f" - MMA tiler (M, N): {self.mma_tiler_mn}") - logger.info(f" - Cluster shape (M, N): {self.cluster_shape_mn}") - logger.info(f" - Cluster size: {cluster_size}") - logger.info(f" - Weight format: Standard PyTorch (runtime transpose)") - logger.info( - f" - Input validation: {'Enabled' if self.validate else 'Disabled'}" - ) - logger.info(f" - CPU-GPU sync optimization: Enabled") - logger.info( - f" - Debug shapes: {'Enabled' if self.debug_shapes else 'Disabled'}" - ) - if cluster_size > 1: - logger.info(f" - Using multi-CTA parallelism") - - def _debug_log_shapes(self, message, **tensors): - """Log tensor shapes for debugging if debug_shapes is enabled""" - if self.debug_shapes: - shape_info = [] - for name, tensor in tensors.items(): - if hasattr(tensor, "shape"): - shape_info.append(f"{name}: {tensor.shape}") - else: - shape_info.append(f"{name}: {type(tensor)}") - logger.debug(f"[SHAPE DEBUG] {message} - {', '.join(shape_info)}") - - def arrange_expert_weights(self, all_weights, submod_name, module): - """Store weights in stacked format (NO transpose - keep original PyTorch format)""" - # Keep original weight format for compatibility: - # gate/up: [intermediate_size, hidden_size] - # down: [hidden_size, intermediate_size] - - # DEBUG: Print shapes to verify no transpose - print(f"[arrange_expert_weights] Processing {submod_name}") - for i, w in enumerate(all_weights): - print(f"[arrange_expert_weights] {submod_name} expert {i}: {w.shape}") - - # NO TRANSPOSE - just stack the original weights - stacked = torch.stack(all_weights) - print( - f"[arrange_expert_weights] {submod_name} final stacked shape: {stacked.shape}" - ) - return stacked - - def execute(self, contig_tokens, m_sizes, m_offsets, module): - """ - Execute using CUTLASS grouped GEMM kernel with standard PyTorch weight format. 
- - Args: - contig_tokens: Input tokens arranged contiguously by expert - m_sizes: Tensor of expert sizes (GPU tensor to avoid sync) - m_offsets: Tensor of expert offsets (GPU tensor to avoid sync) - module: MoE module containing weights in standard PyTorch format - """ - # Convert to GPU tensors if needed (avoid CPU-GPU sync) - m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( - m_sizes, m_offsets, contig_tokens.device - ) - - # Validate inputs only if validation is enabled - if self.validate: - self._validate_inputs(contig_tokens, m_sizes_gpu, module) - - # Get weights and device - weights = self._get_weights(module) - device = contig_tokens.device - - # Debug logging - force print for visibility - if self.debug_shapes: - print(f"[DEBUG] Input tensors - contig_tokens: {contig_tokens.shape}") - print(f"[DEBUG] Gate weights: {weights['gate'].shape}") - print(f"[DEBUG] Up weights: {weights['up'].shape}") - print(f"[DEBUG] Down weights: {weights['down'].shape}") - print(f"[DEBUG] m_sizes_gpu: {m_sizes_gpu}") - print(f"[DEBUG] m_offsets_gpu: {m_offsets_gpu}") - - # Prepare output tensor - use down projection weight shape for final output size - # Down weights are [num_experts, hidden_size, intermediate_size], so output is hidden_size - output = torch.zeros( - contig_tokens.shape[0], - weights["down"].shape[1], # hidden_size from down projection - dtype=self.DTYPE_TORCH, - device=device, - ) - - # Check for valid experts using GPU operations (defer sync) - has_valid_experts = self._has_valid_experts_gpu(m_sizes_gpu) - - # Early exit if no valid experts (minimal sync only when needed) - if not has_valid_experts.item(): - return output - - # Execute the three-stage computation using GPU-only operations - gate_outputs, up_outputs = self._execute_projections_gpu( - contig_tokens, - weights["gate"], - weights["up"], - m_sizes_gpu, - m_offsets_gpu, - device, - ) - - hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) - - final_outputs = self._execute_down_projection_gpu( - hidden_states, weights["down"], m_sizes_gpu, device - ) - - return self._reconstruct_output_gpu( - final_outputs, m_sizes_gpu, m_offsets_gpu, output - ) - - def _ensure_gpu_tensors(self, m_sizes, m_offsets, device): - """Ensure sizes and offsets are GPU tensors with minimal CPU-GPU sync""" - # Convert m_sizes - if not isinstance(m_sizes, torch.Tensor): - m_sizes_gpu = torch.tensor(m_sizes, dtype=torch.int32, device=device) - else: - # Only move if not already on correct device (avoids unnecessary transfer) - if m_sizes.device != device or m_sizes.dtype != torch.int32: - m_sizes_gpu = m_sizes.to(device=device, dtype=torch.int32) - else: - m_sizes_gpu = m_sizes - - # Convert m_offsets - if not isinstance(m_offsets, torch.Tensor): - m_offsets_gpu = torch.tensor(m_offsets, dtype=torch.int32, device=device) - else: - # Only move if not already on correct device (avoids unnecessary transfer) - if m_offsets.device != device or m_offsets.dtype != torch.int32: - m_offsets_gpu = m_offsets.to(device=device, dtype=torch.int32) - else: - m_offsets_gpu = m_offsets - - return m_sizes_gpu, m_offsets_gpu - - def _has_valid_experts_gpu(self, m_sizes_gpu): - """Check if any experts have tokens using GPU operations (no sync).""" - # Return the tensor itself - let caller decide when to sync - return torch.any(m_sizes_gpu > 0) - - def _validate_inputs(self, contig_tokens, m_sizes_gpu, module): - """Validate input parameters with minimal GPU sync""" - # Check dtype without sync (comparison is done on device info) - if 
contig_tokens.dtype != self.DTYPE_TORCH: - raise ValueError( - f"Expected input dtype {self.DTYPE_TORCH}, got {contig_tokens.dtype}" - ) - - # Check tensor dimensionality (no sync needed) - if len(contig_tokens.shape) != 2: - raise ValueError( - f"Expected 2D input tensor, got shape {contig_tokens.shape}" - ) - - # Check parameter existence (no sync needed) - required_params = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] - for param in required_params: - if not hasattr(module, param) or module.get_parameter(param) is None: - raise ValueError(f"Module missing required parameter: {param}") - - def _get_weights(self, module): - """Extract and return weight tensors from module (original format, not transposed).""" - return { - "gate": module.get_parameter( - "gate_proj_weight" - ), # [num_experts, intermediate_size, hidden_size] - "up": module.get_parameter( - "up_proj_weight" - ), # [num_experts, intermediate_size, hidden_size] - "down": module.get_parameter( - "down_proj_weight" - ), # [num_experts, hidden_size, intermediate_size] - } - - def _execute_projections_gpu( - self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device - ): - """Execute gate and up projections using GPU-only operations.""" - # Find valid experts using GPU operations - valid_mask = m_sizes_gpu > 0 - valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] - - if len(valid_indices) == 0: - return [], [] - - # Prepare metadata in batch using GPU operations - problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs = ( - self._prepare_gate_up_metadata_gpu( - input_tokens, - weight1, - weight2, - m_sizes_gpu, - m_offsets_gpu, - valid_indices, - device, - ) - ) - - if len(problem_sizes) == 0: - return [], [] - - # Execute grouped GEMM - self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) - - return gate_outputs, up_outputs - - def _prepare_gate_up_metadata_gpu( - self, - input_tokens, - gate_weights, - up_weights, - m_sizes_gpu, - m_offsets_gpu, - valid_indices, - device, - ): - """Prepare metadata for gate and up projections with minimal CPU-GPU sync""" - problem_sizes = [] - strides_abc = [] - ptrs_abc = [] - gate_outputs = [] - up_outputs = [] - - # Extract valid sizes and offsets (keep on GPU as long as possible) - valid_sizes = m_sizes_gpu[valid_indices] - valid_offsets = ( - m_offsets_gpu[valid_indices] - if len(m_offsets_gpu) > len(valid_indices) - else torch.cumsum( - torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 - ) - ) - - # Filter out zero-size experts on GPU before any CPU transfer - nonzero_mask = valid_sizes > 0 - if not torch.any(nonzero_mask).item(): # Only sync needed for early exit - return problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs - - # Apply mask to get final valid experts - final_valid_indices = valid_indices[nonzero_mask] - final_valid_sizes = valid_sizes[nonzero_mask] - final_valid_offsets = valid_offsets[nonzero_mask] - - # Single batch CPU transfer at the end - final_indices_cpu = final_valid_indices.cpu() - final_sizes_cpu = final_valid_sizes.cpu() - final_offsets_cpu = final_valid_offsets.cpu() - - # Convert to lists once - indices_list = final_indices_cpu.tolist() - sizes_list = final_sizes_cpu.tolist() - offsets_list = final_offsets_cpu.tolist() - - # Now iterate with pre-transferred data - for expert_idx, size, offset in zip(indices_list, sizes_list, offsets_list): - # Get expert data - expert_tokens = input_tokens[offset : offset + size].contiguous() - # Original weight format: gate/up are 
[intermediate_size, hidden_size] - # Need to transpose for matrix multiplication: tokens @ weight.t() - gate_weight = ( - gate_weights[expert_idx].t().contiguous() - ) # [hidden_size, intermediate_size] - up_weight = ( - up_weights[expert_idx].t().contiguous() - ) # [hidden_size, intermediate_size] - - if self.debug_shapes: - print(f"[DEBUG] Gate/Up projection expert {expert_idx}") - print(f" - expert_tokens: {expert_tokens.shape}") - print(f" - gate_weight (after .t()): {gate_weight.shape}") - print(f" - up_weight (after .t()): {up_weight.shape}") - - M, K = expert_tokens.shape # M = batch_size, K = hidden_size - K_weight, N = ( - gate_weight.shape - ) # K_weight = hidden_size, N = intermediate_size - - # Verify dimension compatibility - if K != K_weight: - raise ValueError( - f"Dimension mismatch in gate/up projections: " - f"input tokens have {K} features but weight expects {K_weight}. " - f"Tokens shape: {expert_tokens.shape}, Gate weight shape: {gate_weight.shape}" - ) - - # Create output tensors - gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) - up_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) - - # Add both projections to metadata - for weight, output, output_list in [ - (gate_weight, gate_output, gate_outputs), - (up_weight, up_output, up_outputs), - ]: - self._add_projection_to_metadata( - expert_tokens, - weight, - output, - problem_sizes, - strides_abc, - ptrs_abc, - ) - output_list.append(output) - - return problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs - - def _execute_down_projection_gpu( - self, hidden_states, down_weights, m_sizes_gpu, device - ): - """Execute down projection using GPU operations.""" - if not hidden_states: - return [] - - # Find valid experts - valid_mask = m_sizes_gpu > 0 - valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] - - # Prepare metadata - problem_sizes, strides_abc, ptrs_abc, down_outputs = ( - self._prepare_down_metadata_gpu( - hidden_states, down_weights, valid_indices, device - ) - ) - - if len(problem_sizes) == 0: - return [] - - # Execute grouped GEMM - self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) - - return down_outputs - - def _prepare_down_metadata_gpu( - self, hidden_states, down_weights, valid_indices, device - ): - """Prepare metadata for down projection with minimal CPU-GPU sync""" - problem_sizes = [] - strides_abc = [] - ptrs_abc = [] - down_outputs = [] - - # Filter valid indices to match hidden states length on GPU - num_hidden_states = len(hidden_states) - if num_hidden_states == 0: - return problem_sizes, strides_abc, ptrs_abc, down_outputs - - # Limit valid indices to available hidden states (GPU operation) - valid_indices_limited = valid_indices[:num_hidden_states] - - # Single batch CPU transfer - valid_indices_cpu = valid_indices_limited.cpu().tolist() - - for i, expert_idx in enumerate(valid_indices_cpu): - if i < num_hidden_states: - hidden = hidden_states[i] - # Original down weight format: [hidden_size, intermediate_size] - # Need to transpose for matrix multiplication: hidden @ weight.t() - down_weight = ( - down_weights[expert_idx].t().contiguous() - ) # [intermediate_size, hidden_size] - - if self.debug_shapes: - print(f"[DEBUG] Down projection expert {expert_idx}") - print(f" - hidden: {hidden.shape}") - print(f" - down_weight (after .t()): {down_weight.shape}") - - M, K = hidden.shape # M = batch_size, K = intermediate_size - K_weight, N = ( - down_weight.shape - ) # K_weight = intermediate_size, N = hidden_size - - # 
Verify dimension compatibility - if K != K_weight: - raise ValueError( - f"Dimension mismatch in down projection: " - f"hidden states have {K} features but down_weight expects {K_weight} input features. " - f"Hidden shape: {hidden.shape}, Down weight shape: {down_weight.shape}" - ) - - # Create output tensor - down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) - down_outputs.append(down_output) - - # Add to metadata - self._add_projection_to_metadata( - hidden, - down_weight, - down_output, - problem_sizes, - strides_abc, - ptrs_abc, - ) - - return problem_sizes, strides_abc, ptrs_abc, down_outputs - - def _add_projection_to_metadata( - self, - input_tensor, - weight_tensor, - output_tensor, - problem_sizes, - strides_abc, - ptrs_abc, - ): - """Add a single projection to the metadata lists (weights are transposed at call site).""" - M, K = input_tensor.shape # M = batch_size, K = input_features - K_weight, N = ( - weight_tensor.shape - ) # K_weight = input_features, N = output_features - L = 1 - - # Debug print - if self.debug_shapes: - print( - f"[DEBUG] Matrix mult: input {input_tensor.shape} @ weight {weight_tensor.shape} -> output {output_tensor.shape}" - ) - - # Verify dimension compatibility for matrix multiplication - if K != K_weight: - raise ValueError( - f"Matrix multiplication dimension mismatch: " - f"input has {K} features but weight expects {K_weight} input features. " - f"Input shape: {input_tensor.shape}, Weight shape: {weight_tensor.shape}" - ) - - # Convert to MNKL format - input_mnkl = input_tensor.unsqueeze(-1).contiguous() - weight_mnkl = weight_tensor.unsqueeze(-1).contiguous() - output_mnkl = output_tensor.unsqueeze(-1).contiguous() - - # Extract strides - input_strides = list(input_mnkl.stride()[:2]) - weight_strides = list(weight_mnkl.stride()[:2]) - output_strides = list(output_mnkl.stride()[:2]) - - # Add to metadata - problem_sizes.append([M, N, K, L]) - strides_abc.append([input_strides, weight_strides, output_strides]) - ptrs_abc.append( - [ - input_tensor.data_ptr(), - weight_tensor.data_ptr(), - output_tensor.data_ptr(), - ] - ) - - def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): - """Execute the grouped GEMM kernel.""" - num_groups = len(problem_sizes) - - # Convert to CUTE tensors - problem_sizes_cute, strides_cute, ptrs_cute = self._convert_to_cute_tensors( - problem_sizes, strides_abc, ptrs_abc, device - ) - - # Get tensormap and compute clusters - tensormap_cute = self._get_tensormap_buffer(device) - total_clusters = self._compute_total_clusters(problem_sizes) - - # Get initial tensors for compilation - initial_tensors = self._create_initial_tensors(problem_sizes[0], device) - - # Compile or retrieve kernel - compiled_kernel = self._get_compiled_kernel( - num_groups, - total_clusters, - initial_tensors, - problem_sizes_cute, - strides_cute, - ptrs_cute, - tensormap_cute, - ) - - # Execute kernel - compiled_kernel( - *initial_tensors, - problem_sizes_cute, - strides_cute, - ptrs_cute, - tensormap_cute, - self.stream, - ) - torch.cuda.synchronize() - - def _convert_to_cute_tensors(self, problem_sizes, strides_abc, ptrs_abc, device): - """Convert metadata to CUTE tensors.""" - problem_sizes_tensor = torch.tensor( - problem_sizes, dtype=torch.int32, device=device - ) - strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) - ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) - - return ( - from_dlpack(problem_sizes_tensor, assumed_align=self.ALIGNMENT), - 
from_dlpack(strides_tensor, assumed_align=self.ALIGNMENT), - from_dlpack(ptrs_tensor, assumed_align=self.ALIGNMENT), - ) - - def _get_compiled_kernel( - self, - num_groups, - total_clusters, - initial_tensors, - problem_sizes_cute, - strides_cute, - ptrs_cute, - tensormap_cute, - ): - """Get or compile the grouped GEMM kernel.""" - cache_key = ( - num_groups, - total_clusters, - self.use_2cta_instrs, - self.mma_tiler_mn, - self.cluster_shape_mn, - ) - - if cache_key not in self._compiled_kernels: - logger.info( - f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}" - ) - - self._compiled_kernels[cache_key] = cute.compile( - self.grouped_gemm, - *initial_tensors, - num_groups, - problem_sizes_cute, - strides_cute, - ptrs_cute, - total_clusters, - tensormap_cute, - self.max_active_clusters, - self.stream, - ) - logger.info("Kernel compilation successful") - - return self._compiled_kernels[cache_key] - - def _create_initial_tensors(self, problem_shape, device): - """Create initial CUTE tensors for kernel compilation.""" - M, N, K, L = problem_shape - - # Create tensors with standard PyTorch layout (weights will be transposed at runtime) - tensors = [ - torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A (input) - torch.randn( - K, N, dtype=self.DTYPE_TORCH, device=device - ), # B (transposed weight) - torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C (output) - ] - - # Convert to MNKL format and create CUTE tensors - cute_tensors = [] - for tensor in tensors: - mnkl_tensor = tensor.unsqueeze(-1).contiguous() - cute_tensor = from_dlpack(mnkl_tensor, assumed_align=self.ALIGNMENT) - cute_tensor.element_type = self.DTYPE_CUTLASS - cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) - cute_tensors.append(cute_tensor) - - return cute_tensors - - def _get_tensormap_buffer(self, device): - """Get or create tensormap buffer.""" - if device not in self._tensormap_buffers: - sm_count = self.hardware_info.get_max_active_clusters(1) - tensormap_tensor = torch.zeros( - (sm_count, self.TENSORMAP_COUNT, self.TENSORMAP_BYTES // 8), - dtype=torch.int64, - device=device, - ) - self._tensormap_buffers[device] = from_dlpack( - tensormap_tensor, assumed_align=self.ALIGNMENT - ) - - return self._tensormap_buffers[device] - - def _compute_total_clusters(self, problem_sizes): - """Compute total number of clusters needed.""" - cluster_tile_m = self.mma_tiler_mn[0] - cluster_tile_n = self.mma_tiler_mn[1] - - # Adjust for 2 CTA mode and cluster shape - if self.use_2cta_instrs: - cluster_tile_m //= 2 - - cluster_tile_m *= self.cluster_shape_mn[0] - cluster_tile_n *= self.cluster_shape_mn[1] - - total = 0 - for M, N, K, L in problem_sizes: - clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m - clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n - total += clusters_m * clusters_n - - return total - - def _apply_activation_and_combine(self, gate_outputs, up_outputs): - """Apply activation and combine gate/up outputs.""" - return [ - self.activation_function(gate_out) * up_out - for gate_out, up_out in zip(gate_outputs, up_outputs) - ] - - def _reconstruct_output_gpu( - self, final_outputs, m_sizes_gpu, m_offsets_gpu, output - ): - """Reconstruct the full output tensor with minimal CPU-GPU sync""" - if not final_outputs: - return output - - # Find valid experts on GPU - valid_mask = m_sizes_gpu > 0 - valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] - valid_sizes = m_sizes_gpu[valid_indices] - - # Filter 
to match final_outputs length - num_outputs = len(final_outputs) - if num_outputs == 0: - return output - - # Limit to available outputs - valid_indices_limited = valid_indices[:num_outputs] - valid_sizes_limited = valid_sizes[:num_outputs] - - # Compute offsets if not provided properly (GPU operations) - if len(m_offsets_gpu) <= len(valid_indices_limited): - valid_offsets_limited = torch.cumsum( - torch.cat( - [ - torch.tensor([0], device=m_sizes_gpu.device), - valid_sizes_limited[:-1], - ] - ), - dim=0, - ) - else: - valid_offsets_limited = m_offsets_gpu[valid_indices_limited] - - # Single batch CPU transfer for reconstruction - valid_sizes_cpu = valid_sizes_limited.cpu().tolist() - valid_offsets_cpu = valid_offsets_limited.cpu().tolist() - - # Reconstruct output using pre-transferred data - for i, (size, offset) in enumerate(zip(valid_sizes_cpu, valid_offsets_cpu)): - if i < len(final_outputs): - output[offset : offset + size] = final_outputs[i] - - return output - - @staticmethod - def is_available() -> bool: - """Check if CUTLASS is available on the current system.""" - return HAS_CUTLASS From 211e75a807e2ec804efbc0d02c5b4a75e2b514df Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sat, 21 Jun 2025 23:13:34 -0700 Subject: [PATCH 29/34] standalone still not working... --- .../deepseek_v3/cutlass_grouped_gemm.py | 138 +++++++++++------- 1 file changed, 89 insertions(+), 49 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py index a5031f7fb..e6a956728 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py @@ -42,10 +42,27 @@ - hidden: torch.Size([12288, 1408]) - down_weight (pre-transposed): torch.Size([1408, 2048]) - -Error: -Error using cutlass strategy: The expanded size of the tensor (1408) must match the existing size (2048) at non-singleton dimension 1. Target sizes: [12288, 1408]. Tensor sizes: [12288, 2048] - +[DEBUG] Gate/Up projection expert 0 (optimized - no transpose) + - expert_tokens: torch.Size([12288, 2048]) + - gate_weight (pre-transposed): torch.Size([1408, 2048]) + - up_weight (pre-transposed): torch.Size([1408, 2048]) +Error using cutlass strategy: Dimension mismatch: input has 2048 features but weight expects 1408. expert_tokens: torch.Size([12288, 2048]), gate_weight: torch.Size([1408, 2048]) +[DEBUG] Gate/Up projection expert 0 (optimized - no transpose) + - expert_tokens: torch.Size([12288, 2048]) + - gate_weight (pre-transposed): torch.Size([1408, 2048]) + - up_weight (pre-transposed): torch.Size([1408, 2048]) +Error using cutlass strategy: Dimension mismatch: input has 2048 features but weight expects 1408. expert_tokens: torch.Size([12288, 2048]), gate_weight: torch.Size([1408, 2048]) +[DEBUG] Gate/Up projection expert 0 (optimized - no transpose) + - expert_tokens: torch.Size([12288, 2048]) + - gate_weight (pre-transposed): torch.Size([1408, 2048]) + - up_weight (pre-transposed): torch.Size([1408, 2048]) +Error using cutlass strategy: Dimension mismatch: input has 2048 features but weight expects 1408. 
expert_tokens: torch.Size([12288, 2048]), gate_weight: torch.Size([1408, 2048]) +[DEBUG] Gate/Up projection expert 0 (optimized - no transpose) + - expert_tokens: torch.Size([12288, 2048]) + - gate_weight (pre-transposed): torch.Size([1408, 2048]) + - up_weight (pre-transposed): torch.Size([1408, 2048]) +Error using cutlass strategy: Dimension mismatch: input has 2048 features but weight expects 1408. expert_tokens: torch.Size([12288, 2048]), gate_weight: torch.Size([1408, 2048]) +[rank1]: Traceback (most recent call last): """ # Disable file caching while keeping in-memory cache available, defaults to False. @@ -83,7 +100,14 @@ class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): - # Constants + """ + Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. + + Optimized version with pre-transposed weights to eliminate runtime transpose operations. + Based on the working "_prev" version with weight pre-transposition optimization. + """ + + # Constants (same as before) SUPPORTED_CLUSTER_SHAPES = [ (1, 1), (1, 2), @@ -176,44 +200,47 @@ def arrange_expert_weights(self, all_weights, submod_name, module): """ Store weights in stacked format with pre-transposition for optimal GEMM performance. - This eliminates the need for runtime transpose operations. + This eliminates the need for runtime transpose operations by storing weights + in the format expected by CUTLASS operations. - Original PyTorch weight shapes: - - gate_proj_weight: [intermediate_size, hidden_size] - - up_proj_weight: [intermediate_size, hidden_size] - - down_proj_weight: [hidden_size, intermediate_size] - - Pre-transposed shapes for direct GEMM usage: - - gate_proj_weight: [hidden_size, intermediate_size] (transposed) - - up_proj_weight: [hidden_size, intermediate_size] (transposed) - - down_proj_weight: [intermediate_size, hidden_size] (transposed) + Based on debug output analysis: + - gate_proj_weight: [intermediate_size, hidden_size] → [hidden_size, intermediate_size] + - up_proj_weight: [intermediate_size, hidden_size] → [hidden_size, intermediate_size] + - down_proj_weight: [hidden_size, intermediate_size] → [intermediate_size, hidden_size] """ print(f"[arrange_expert_weights] Processing {submod_name}") - # Determine if this weight needs transposition based on submodule name - needs_transpose = submod_name in ["gate_proj_weight", "up_proj_weight"] - + # Pre-transpose weights based on their usage pattern from debug output transposed_weights = [] for i, weight in enumerate(all_weights): original_shape = weight.shape - if needs_transpose: - # Transpose gate/up weights: [intermediate_size, hidden_size] -> [hidden_size, intermediate_size] + if submod_name in ["gate_proj_weight", "up_proj_weight"]: + # For gate/up: transpose [intermediate_size, hidden_size] → [hidden_size, intermediate_size] + # This matches the debug output: "gate_weight (after .t()): torch.Size([2048, 1408])" + # Original would be [1408, 2048], transposed becomes [2048, 1408] transposed_weight = weight.t().contiguous() print( - f"[arrange_expert_weights] {submod_name} expert {i}: {original_shape} -> {transposed_weight.shape} (transposed)" + f"[arrange_expert_weights] {submod_name} expert {i}: {original_shape} → {transposed_weight.shape} (pre-transposed)" ) - else: - # Keep down weights as-is for now, will transpose during stacking - # down_proj_weight: [hidden_size, intermediate_size] -> [intermediate_size, hidden_size] + elif submod_name == "down_proj_weight": + # For down: transpose [hidden_size, intermediate_size] → 
[intermediate_size, hidden_size] + # This matches the debug output: "down_weight (after .t()): torch.Size([1408, 2048])" + # Original would be [2048, 1408], transposed becomes [1408, 2048] transposed_weight = weight.t().contiguous() print( - f"[arrange_expert_weights] {submod_name} expert {i}: {original_shape} -> {transposed_weight.shape} (transposed)" + f"[arrange_expert_weights] {submod_name} expert {i}: {original_shape} → {transposed_weight.shape} (pre-transposed)" + ) + else: + # Unknown weight type, keep as-is + transposed_weight = weight.contiguous() + print( + f"[arrange_expert_weights] {submod_name} expert {i}: {original_shape} (no transpose)" ) transposed_weights.append(transposed_weight) - # Stack all transposed weights + # Stack all pre-transposed weights stacked = torch.stack(transposed_weights) print( f"[arrange_expert_weights] {submod_name} final stacked shape: {stacked.shape}" @@ -240,10 +267,12 @@ def execute(self, contig_tokens, m_sizes, m_offsets, module): weights = self._get_weights(module) device = contig_tokens.device - # Prepare output tensor using down projection output size + # Prepare output tensor - use the correct dimension from pre-transposed down weights + # Pre-transposed down weights: [num_experts, intermediate_size, hidden_size] + # So output dimension is hidden_size (shape[2]) output = torch.zeros( contig_tokens.shape[0], - weights["down"].shape[1], # hidden_size (after transpose) + weights["down"].shape[2], # hidden_size from pre-transposed down weights dtype=self.DTYPE_TORCH, device=device, ) @@ -252,11 +281,15 @@ def execute(self, contig_tokens, m_sizes, m_offsets, module): if not self._has_valid_experts_gpu(m_sizes_gpu): return output - # Execute the three-stage computation with pre-transposed weights + # Execute the three-stage computation with pre-transposed weights (NO runtime transpose!) 
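# --- Editor's note (annotation, not part of the patch) --------------------
# A minimal PyTorch reference sketch of the three-stage per-expert computation
# that the grouped GEMM path above implements, assuming pre-transposed weights
# (gate/up stored as [hidden, intermediate], down as [intermediate, hidden])
# and SiLU in place of the strategy's custom activation callable. All names
# and sizes below are illustrative, not part of the committed code.
import torch
import torch.nn.functional as F

def reference_expert_ffn(expert_tokens, gate_w_t, up_w_t, down_w_t):
    # expert_tokens: [m, hidden]; the *_w_t weights are already transposed,
    # so plain matmuls replace the runtime .t() calls of the earlier version.
    gate_out = expert_tokens @ gate_w_t          # [m, intermediate]
    up_out = expert_tokens @ up_w_t              # [m, intermediate]
    hidden = F.silu(gate_out) * up_out           # SwiGLU-style gate/up combine
    return hidden @ down_w_t                     # [m, hidden]

m, hidden_size, intermediate_size = 16, 2048, 1408
x = torch.randn(m, hidden_size)
gate_w_t = torch.randn(hidden_size, intermediate_size)   # pre-transposed gate
up_w_t = torch.randn(hidden_size, intermediate_size)     # pre-transposed up
down_w_t = torch.randn(intermediate_size, hidden_size)   # pre-transposed down
assert reference_expert_ffn(x, gate_w_t, up_w_t, down_w_t).shape == (m, hidden_size)
# --------------------------------------------------------------------------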
gate_outputs, up_outputs = self._execute_projections_gpu( contig_tokens, - weights["gate"], # Already transposed to [hidden_size, intermediate_size] - weights["up"], # Already transposed to [hidden_size, intermediate_size] + weights[ + "gate" + ], # Pre-transposed: [num_experts, hidden_size, intermediate_size] + weights[ + "up" + ], # Pre-transposed: [num_experts, hidden_size, intermediate_size] m_sizes_gpu, m_offsets_gpu, device, @@ -266,7 +299,9 @@ def execute(self, contig_tokens, m_sizes, m_offsets, module): final_outputs = self._execute_down_projection_gpu( hidden_states, - weights["down"], # Already transposed to [intermediate_size, hidden_size] + weights[ + "down" + ], # Pre-transposed: [num_experts, intermediate_size, hidden_size] m_sizes_gpu, device, ) @@ -298,19 +333,19 @@ def _get_weights(self, module): return { "gate": module.get_parameter( "gate_proj_weight" - ), # Pre-transposed to [num_experts, hidden_size, intermediate_size] + ), # Pre-transposed: [num_experts, hidden_size, intermediate_size] "up": module.get_parameter( "up_proj_weight" - ), # Pre-transposed to [num_experts, hidden_size, intermediate_size] + ), # Pre-transposed: [num_experts, hidden_size, intermediate_size] "down": module.get_parameter( "down_proj_weight" - ), # Pre-transposed to [num_experts, intermediate_size, hidden_size] + ), # Pre-transposed: [num_experts, intermediate_size, hidden_size] } def _execute_projections_gpu( self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device ): - """Execute gate and up projections using pre-transposed weights.""" + """Execute gate and up projections using pre-transposed weights (NO runtime transpose).""" # Find valid experts using GPU operations valid_mask = m_sizes_gpu > 0 valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] @@ -375,7 +410,7 @@ def _prepare_gate_up_metadata_gpu( zip(valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu) ): if size > 0: - # Get expert data and PRE-TRANSPOSED weights (no runtime transpose needed!) + # Get expert data and PRE-TRANSPOSED weights (NO runtime transpose needed!) expert_tokens = input_tokens[offset : offset + size].contiguous() gate_weight = gate_weights[ expert_idx @@ -384,7 +419,9 @@ def _prepare_gate_up_metadata_gpu( expert_idx ].contiguous() # Already [hidden_size, intermediate_size] - print(f"[DEBUG] Gate/Up projection expert {expert_idx} (optimized)") + print( + f"[DEBUG] Gate/Up projection expert {expert_idx} (optimized - no transpose)" + ) print(f" - expert_tokens: {expert_tokens.shape}") print(f" - gate_weight (pre-transposed): {gate_weight.shape}") print(f" - up_weight (pre-transposed): {up_weight.shape}") @@ -397,7 +434,8 @@ def _prepare_gate_up_metadata_gpu( # Verify dimension compatibility if K != K_weight: raise ValueError( - f"Dimension mismatch: input has {K} features but weight expects {K_weight}" + f"Dimension mismatch: input has {K} features but weight expects {K_weight}. 
" + f"expert_tokens: {expert_tokens.shape}, gate_weight: {gate_weight.shape}" ) # Create output tensors @@ -424,7 +462,7 @@ def _prepare_gate_up_metadata_gpu( def _execute_down_projection_gpu( self, hidden_states, down_weights, m_sizes_gpu, device ): - """Execute down projection using pre-transposed weights.""" + """Execute down projection using pre-transposed weights (NO runtime transpose).""" if not hidden_states: return [] @@ -466,7 +504,9 @@ def _prepare_down_metadata_gpu( expert_idx ].contiguous() # Already [intermediate_size, hidden_size] - print(f"[DEBUG] Down projection expert {expert_idx} (optimized)") + print( + f"[DEBUG] Down projection expert {expert_idx} (optimized - no transpose)" + ) print(f" - hidden: {hidden.shape}") print(f" - down_weight (pre-transposed): {down_weight.shape}") @@ -478,7 +518,8 @@ def _prepare_down_metadata_gpu( # Verify dimension compatibility if K != K_weight: raise ValueError( - f"Dimension mismatch: hidden has {K} features but weight expects {K_weight}" + f"Dimension mismatch: hidden has {K} features but weight expects {K_weight}. " + f"hidden: {hidden.shape}, down_weight: {down_weight.shape}" ) # Create output tensor @@ -508,7 +549,7 @@ def _add_projection_to_metadata( ): """Add a single projection to the metadata lists (weights are pre-transposed).""" M, K = input_tensor.shape - K_weight, N = weight_tensor.shape + K_weight, N = weight_tensor.shape # Pre-transposed weights: [K, N] L = 1 print( @@ -542,7 +583,6 @@ def _add_projection_to_metadata( ] ) - # Rest of the methods remain the same... def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): """Execute the grouped GEMM kernel.""" num_groups = len(problem_sizes) @@ -637,16 +677,16 @@ def _get_compiled_kernel( return self._compiled_kernels[cache_key] def _create_initial_tensors(self, problem_shape, device): - """Create initial CUTE tensors for kernel compilation.""" + """Create initial CUTE tensors for kernel compilation with pre-transposed weight format.""" M, N, K, L = problem_shape - # Create tensors (weights are already in correct transposed format) + # Create tensors for pre-transposed weight format tensors = [ - torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A + torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A (input) torch.randn( K, N, dtype=self.DTYPE_TORCH, device=device - ), # B (pre-transposed) - torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C + ), # B (pre-transposed weight) + torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C (output) ] # Convert to MNKL format and create CUTE tensors From 20a0a92dbe3d4da480ba3c3c4723e774a8e30438 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sat, 21 Jun 2025 23:30:48 -0700 Subject: [PATCH 30/34] pretranspose working --- .../deepseek_v3/cutlass_grouped_gemm.py | 132 ++---------------- 1 file changed, 14 insertions(+), 118 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py index e6a956728..5ebebb8be 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py @@ -5,64 +5,7 @@ """ """ -Shapes: -Kernel compilation successful -[DEBUG] Down projection expert 0 (optimized) - - hidden: torch.Size([12288, 1408]) - - down_weight (pre-transposed): torch.Size([1408, 2048]) -[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) -[DEBUG] Down 
projection expert 1 (optimized) - - hidden: torch.Size([12288, 1408]) - - down_weight (pre-transposed): torch.Size([1408, 2048]) -[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) -[DEBUG] Down projection expert 2 (optimized) - - hidden: torch.Size([12288, 1408]) - - down_weight (pre-transposed): torch.Size([1408, 2048]) -2025-06-21 22:58:46,021 - INFO - cuModuleLoadData 1080453344 -[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) -[DEBUG] Down projection expert 3 (optimized) - - hidden: torch.Size([12288, 1408]) - - down_weight (pre-transposed): torch.Size([1408, 2048]) -2025-06-21 22:58:46,021 - INFO - cuModuleGetFunction kernel_cutlass_kernel_torchtitanexperimentskernelsblackwellcute_grouped_gemmGroupedGemmKernel_object_at__TiledMMA_ThrLayoutVMNK21111000_PermutationMNK____MMAAtom_ThrID21_ShapeMNK25612816__0 -2025-06-21 22:58:46,021 - INFO - <-- cuModuleGetFunction -[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) -[DEBUG] Down projection expert 4 (optimized) - - hidden: torch.Size([12288, 1408]) - - down_weight (pre-transposed): torch.Size([1408, 2048]) -[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) -[DEBUG] Down projection expert 5 (optimized) - - hidden: torch.Size([12288, 1408]) - - down_weight (pre-transposed): torch.Size([1408, 2048]) -[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) -[DEBUG] Down projection expert 6 (optimized) - - hidden: torch.Size([12288, 1408]) - - down_weight (pre-transposed): torch.Size([1408, 2048]) -[DEBUG] Matrix mult (optimized): input torch.Size([12288, 1408]) @ weight torch.Size([1408, 2048]) -> output torch.Size([12288, 2048]) -[DEBUG] Down projection expert 7 (optimized) - - hidden: torch.Size([12288, 1408]) - - down_weight (pre-transposed): torch.Size([1408, 2048]) - -[DEBUG] Gate/Up projection expert 0 (optimized - no transpose) - - expert_tokens: torch.Size([12288, 2048]) - - gate_weight (pre-transposed): torch.Size([1408, 2048]) - - up_weight (pre-transposed): torch.Size([1408, 2048]) -Error using cutlass strategy: Dimension mismatch: input has 2048 features but weight expects 1408. expert_tokens: torch.Size([12288, 2048]), gate_weight: torch.Size([1408, 2048]) -[DEBUG] Gate/Up projection expert 0 (optimized - no transpose) - - expert_tokens: torch.Size([12288, 2048]) - - gate_weight (pre-transposed): torch.Size([1408, 2048]) - - up_weight (pre-transposed): torch.Size([1408, 2048]) -Error using cutlass strategy: Dimension mismatch: input has 2048 features but weight expects 1408. expert_tokens: torch.Size([12288, 2048]), gate_weight: torch.Size([1408, 2048]) -[DEBUG] Gate/Up projection expert 0 (optimized - no transpose) - - expert_tokens: torch.Size([12288, 2048]) - - gate_weight (pre-transposed): torch.Size([1408, 2048]) - - up_weight (pre-transposed): torch.Size([1408, 2048]) -Error using cutlass strategy: Dimension mismatch: input has 2048 features but weight expects 1408. 
expert_tokens: torch.Size([12288, 2048]), gate_weight: torch.Size([1408, 2048]) -[DEBUG] Gate/Up projection expert 0 (optimized - no transpose) - - expert_tokens: torch.Size([12288, 2048]) - - gate_weight (pre-transposed): torch.Size([1408, 2048]) - - up_weight (pre-transposed): torch.Size([1408, 2048]) -Error using cutlass strategy: Dimension mismatch: input has 2048 features but weight expects 1408. expert_tokens: torch.Size([12288, 2048]), gate_weight: torch.Size([1408, 2048]) -[rank1]: Traceback (most recent call last): + """ # Disable file caching while keeping in-memory cache available, defaults to False. @@ -107,7 +50,7 @@ class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): Based on the working "_prev" version with weight pre-transposition optimization. """ - # Constants (same as before) + # Constants SUPPORTED_CLUSTER_SHAPES = [ (1, 1), (1, 2), @@ -203,50 +146,20 @@ def arrange_expert_weights(self, all_weights, submod_name, module): This eliminates the need for runtime transpose operations by storing weights in the format expected by CUTLASS operations. - Based on debug output analysis: - - gate_proj_weight: [intermediate_size, hidden_size] → [hidden_size, intermediate_size] - - up_proj_weight: [intermediate_size, hidden_size] → [hidden_size, intermediate_size] - - down_proj_weight: [hidden_size, intermediate_size] → [intermediate_size, hidden_size] + Target shapes (to match working version after .t()): + - gate_proj_weight: target [2048, 1408] (for expert_tokens [*, 2048] @ weight [2048, 1408]) + - up_proj_weight: target [2048, 1408] (for expert_tokens [*, 2048] @ weight [2048, 1408]) + - down_proj_weight: target [1408, 2048] (for hidden [*, 1408] @ weight [1408, 2048]) """ - print(f"[arrange_expert_weights] Processing {submod_name}") - - # Pre-transpose weights based on their usage pattern from debug output + # Pre-transpose weights to match exactly what working version produces after .t() transposed_weights = [] - for i, weight in enumerate(all_weights): - original_shape = weight.shape - - if submod_name in ["gate_proj_weight", "up_proj_weight"]: - # For gate/up: transpose [intermediate_size, hidden_size] → [hidden_size, intermediate_size] - # This matches the debug output: "gate_weight (after .t()): torch.Size([2048, 1408])" - # Original would be [1408, 2048], transposed becomes [2048, 1408] - transposed_weight = weight.t().contiguous() - print( - f"[arrange_expert_weights] {submod_name} expert {i}: {original_shape} → {transposed_weight.shape} (pre-transposed)" - ) - elif submod_name == "down_proj_weight": - # For down: transpose [hidden_size, intermediate_size] → [intermediate_size, hidden_size] - # This matches the debug output: "down_weight (after .t()): torch.Size([1408, 2048])" - # Original would be [2048, 1408], transposed becomes [1408, 2048] - transposed_weight = weight.t().contiguous() - print( - f"[arrange_expert_weights] {submod_name} expert {i}: {original_shape} → {transposed_weight.shape} (pre-transposed)" - ) - else: - # Unknown weight type, keep as-is - transposed_weight = weight.contiguous() - print( - f"[arrange_expert_weights] {submod_name} expert {i}: {original_shape} (no transpose)" - ) - + for weight in all_weights: + # Transpose all weight types since they all need it in the working version + transposed_weight = weight.t().contiguous() transposed_weights.append(transposed_weight) # Stack all pre-transposed weights - stacked = torch.stack(transposed_weights) - print( - f"[arrange_expert_weights] {submod_name} final stacked shape: {stacked.shape}" - ) - - 
return stacked + return torch.stack(transposed_weights) def execute(self, contig_tokens, m_sizes, m_offsets, module): """ @@ -268,8 +181,8 @@ def execute(self, contig_tokens, m_sizes, m_offsets, module): device = contig_tokens.device # Prepare output tensor - use the correct dimension from pre-transposed down weights - # Pre-transposed down weights: [num_experts, intermediate_size, hidden_size] - # So output dimension is hidden_size (shape[2]) + # Pre-transposed down weights should be: [num_experts, 1408, 2048] + # So output dimension is hidden_size = 2048 (shape[2]) output = torch.zeros( contig_tokens.shape[0], weights["down"].shape[2], # hidden_size from pre-transposed down weights @@ -410,7 +323,7 @@ def _prepare_gate_up_metadata_gpu( zip(valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu) ): if size > 0: - # Get expert data and PRE-TRANSPOSED weights (NO runtime transpose needed!) + # Get expert data and PRE-TRANSPOSED weights expert_tokens = input_tokens[offset : offset + size].contiguous() gate_weight = gate_weights[ expert_idx @@ -419,13 +332,6 @@ def _prepare_gate_up_metadata_gpu( expert_idx ].contiguous() # Already [hidden_size, intermediate_size] - print( - f"[DEBUG] Gate/Up projection expert {expert_idx} (optimized - no transpose)" - ) - print(f" - expert_tokens: {expert_tokens.shape}") - print(f" - gate_weight (pre-transposed): {gate_weight.shape}") - print(f" - up_weight (pre-transposed): {up_weight.shape}") - M, K = expert_tokens.shape # M = batch_size, K = hidden_size K_weight, N = ( gate_weight.shape @@ -504,12 +410,6 @@ def _prepare_down_metadata_gpu( expert_idx ].contiguous() # Already [intermediate_size, hidden_size] - print( - f"[DEBUG] Down projection expert {expert_idx} (optimized - no transpose)" - ) - print(f" - hidden: {hidden.shape}") - print(f" - down_weight (pre-transposed): {down_weight.shape}") - M, K = hidden.shape # M = batch_size, K = intermediate_size K_weight, N = ( down_weight.shape @@ -552,10 +452,6 @@ def _add_projection_to_metadata( K_weight, N = weight_tensor.shape # Pre-transposed weights: [K, N] L = 1 - print( - f"[DEBUG] Matrix mult (optimized): input {input_tensor.shape} @ weight {weight_tensor.shape} -> output {output_tensor.shape}" - ) - # Verify dimension compatibility if K != K_weight: raise ValueError( From e833196e2559472c0e8c34ace069e54431a3b8d3 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sun, 22 Jun 2025 08:09:57 -0700 Subject: [PATCH 31/34] 3 different versions...one working, 2 with issues --- .../deepseek_v3/cutlass_grouped_gemm.py | 1435 +++++++++++++++-- .../experiments/deepseek_v3/generate.py | 2 +- 2 files changed, 1322 insertions(+), 115 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py index 5ebebb8be..1e4f51960 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py @@ -15,9 +15,11 @@ # export CUTE_DSL_FILE_CACHING_CAPACITY=1000 import logging +from typing import Any, Dict, List, Tuple import torch + try: import cuda.bindings.driver as cuda import cutlass @@ -36,21 +38,28 @@ print(f"✗ CUTLASS import failed: {e}") print("CUTLASSGroupedGemmStrategy will not be available") +from torchtitan.experiments.kernels.blackwell.pytorch_cute_converter import ( + GroupedGemmTensorManager, + PyTorchToCuteConverter, +) + # Import base class - adjust path as needed based on your project structure from .group_gemms import GroupGEMMStrategy logger = 
logging.getLogger(__name__) +# ================== current working version ================== + + class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): """ Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. - Optimized version with pre-transposed weights to eliminate runtime transpose operations. - Based on the working "_prev" version with weight pre-transposition optimization. + This version eliminates CPU-GPU synchronization by keeping all size/offset computations on GPU. """ - # Constants + # Constants (same as before) SUPPORTED_CLUSTER_SHAPES = [ (1, 1), (1, 2), @@ -89,6 +98,9 @@ def __init__( self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() + # Validate configurations + # self._validate_configurations() + # Initialize kernel and hardware info self._initialize_kernel() self._initialize_hardware() @@ -127,6 +139,50 @@ def _initialize_hardware(self): torch_stream = torch.cuda.current_stream() self.stream = cuda.CUstream(torch_stream.cuda_stream) + def _validate_configurations(self): + """Validate configurations for Blackwell.""" + self._validate_mma_tiler() + self._validate_cluster_shape() + self._validate_2cta_constraints() + + def _validate_mma_tiler(self): + """Validate MMA tiler configuration.""" + m_size, n_size = self.mma_tiler_mn + + valid_m_sizes = ( + self.DUAL_CTA_M_SIZES if self.use_2cta_instrs else self.SINGLE_CTA_M_SIZES + ) + mode_name = "2 CTA" if self.use_2cta_instrs else "single CTA" + + if m_size not in valid_m_sizes: + raise ValueError( + f"For {mode_name} mode on Blackwell, MMA tiler M must be in {valid_m_sizes}, got {m_size}" + ) + + if n_size not in self.N_SIZE_RANGE: + raise ValueError( + f"MMA tiler N must be in range [32, 256] with step 32, got {n_size}" + ) + + def _validate_cluster_shape(self): + """Validate cluster shape configuration.""" + if self.cluster_shape_mn not in self.SUPPORTED_CLUSTER_SHAPES: + raise ValueError( + f"Cluster shape {self.cluster_shape_mn} not supported on Blackwell. " + f"Valid cluster shapes are: {self.SUPPORTED_CLUSTER_SHAPES}" + ) + + def _validate_2cta_constraints(self): + """Validate 2 CTA specific constraints.""" + if self.use_2cta_instrs and self.cluster_shape_mn[0] % 2 != 0: + valid_2cta_shapes = [ + shape for shape in self.SUPPORTED_CLUSTER_SHAPES if shape[0] % 2 == 0 + ] + raise ValueError( + f"For 2 CTA mode, cluster shape M must be even, got {self.cluster_shape_mn[0]}. " + f"Valid 2 CTA cluster shapes: {valid_2cta_shapes}" + ) + def _log_initialization(self): """Log initialization information.""" cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] @@ -135,74 +191,52 @@ def _log_initialization(self): print(f" - MMA tiler (M, N): {self.mma_tiler_mn}") print(f" - Cluster shape (M, N): {self.cluster_shape_mn}") print(f" - Cluster size: {cluster_size}") - print(f" - Weight optimization: Pre-transposed (no runtime transpose)") if cluster_size > 1: print(f" - Using multi-CTA parallelism") def arrange_expert_weights(self, all_weights, submod_name, module): - """ - Store weights in stacked format with pre-transposition for optimal GEMM performance. - - This eliminates the need for runtime transpose operations by storing weights - in the format expected by CUTLASS operations. 
- - Target shapes (to match working version after .t()): - - gate_proj_weight: target [2048, 1408] (for expert_tokens [*, 2048] @ weight [2048, 1408]) - - up_proj_weight: target [2048, 1408] (for expert_tokens [*, 2048] @ weight [2048, 1408]) - - down_proj_weight: target [1408, 2048] (for hidden [*, 1408] @ weight [1408, 2048]) - """ - # Pre-transpose weights to match exactly what working version produces after .t() - transposed_weights = [] - for weight in all_weights: - # Transpose all weight types since they all need it in the working version - transposed_weight = weight.t().contiguous() - transposed_weights.append(transposed_weight) - - # Stack all pre-transposed weights - return torch.stack(transposed_weights) + """Store weights in stacked format.""" + return torch.stack(all_weights) def execute(self, contig_tokens, m_sizes, m_offsets, module): """ - Execute using CUTLASS grouped GEMM kernel with pre-transposed weights. + Execute using CUTLASS grouped GEMM kernel - GPU-only version. Args: contig_tokens: Input tokens arranged contiguously by expert m_sizes: Tensor of expert sizes (GPU tensor to avoid sync) m_offsets: Tensor of expert offsets (GPU tensor to avoid sync) - module: MoE module containing pre-transposed weights + module: MoE module containing weights """ # Convert to GPU tensors if needed (avoid CPU-GPU sync) m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( m_sizes, m_offsets, contig_tokens.device ) - # Get pre-transposed weights and device + # Validate inputs + # self._validate_inputs(contig_tokens, m_sizes_gpu, module) + + # Get weights and device weights = self._get_weights(module) device = contig_tokens.device - # Prepare output tensor - use the correct dimension from pre-transposed down weights - # Pre-transposed down weights should be: [num_experts, 1408, 2048] - # So output dimension is hidden_size = 2048 (shape[2]) + # Prepare output tensor output = torch.zeros( contig_tokens.shape[0], - weights["down"].shape[2], # hidden_size from pre-transposed down weights + weights["gate"].shape[2], dtype=self.DTYPE_TORCH, device=device, ) - # Check for valid experts using GPU operations (minimal sync) + # Check for valid experts using GPU operations (no sync) if not self._has_valid_experts_gpu(m_sizes_gpu): return output - # Execute the three-stage computation with pre-transposed weights (NO runtime transpose!) 
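# --- Editor's note (annotation, not part of the patch) --------------------
# A simplified sketch of the per-expert metadata that each projection
# contributes to the grouped GEMM in this "working version": one (M, N, K, L)
# problem shape plus row-major strides of the MNKL views and raw data
# pointers, mirroring what the _add_projection_to_metadata helper in this
# file assembles. Illustrative only; the CUTE tensor conversion and the
# kernel launch are deliberately omitted here.
import torch

def describe_group(a, b, c):
    # a: [M, K] routed tokens, b: [N, K] expert weight as stored, c: [M, N] output
    M, K = a.shape
    N = b.shape[0]
    mnkl = [M, N, K, 1]  # L (batch) is always 1 per expert group
    strides = [list(t.unsqueeze(-1).contiguous().stride()[:2]) for t in (a, b, c)]
    ptrs = [t.data_ptr() for t in (a, b, c)]
    return mnkl, strides, ptrs

a = torch.randn(128, 2048)    # tokens routed to one expert
b = torch.randn(1408, 2048)   # e.g. that expert's gate_proj weight
c = torch.empty(128, 1408)    # output buffer for that expert
problem, strides_abc, ptrs_abc = describe_group(a, b, c)
print(problem)                # [128, 1408, 2048, 1]
# --------------------------------------------------------------------------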
+ # Execute the three-stage computation using GPU-only operations gate_outputs, up_outputs = self._execute_projections_gpu( contig_tokens, - weights[ - "gate" - ], # Pre-transposed: [num_experts, hidden_size, intermediate_size] - weights[ - "up" - ], # Pre-transposed: [num_experts, hidden_size, intermediate_size] + weights["gate"], + weights["up"], m_sizes_gpu, m_offsets_gpu, device, @@ -211,12 +245,7 @@ def execute(self, contig_tokens, m_sizes, m_offsets, module): hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) final_outputs = self._execute_down_projection_gpu( - hidden_states, - weights[ - "down" - ], # Pre-transposed: [num_experts, intermediate_size, hidden_size] - m_sizes_gpu, - device, + hidden_states, weights["down"], m_sizes_gpu, device ) return self._reconstruct_output_gpu( @@ -238,27 +267,40 @@ def _ensure_gpu_tensors(self, m_sizes, m_offsets, device): return m_sizes_gpu, m_offsets_gpu def _has_valid_experts_gpu(self, m_sizes_gpu): - """Check if any experts have tokens using GPU operations (minimal sync).""" - return torch.any(m_sizes_gpu > 0).item() + """Check if any experts have tokens using GPU operations (no sync).""" + return torch.any( + m_sizes_gpu > 0 + ).item() # Single sync here is unavoidable for control flow + + def _validate_inputs(self, contig_tokens, m_sizes_gpu, module): + """Validate input parameters.""" + if contig_tokens.dtype != self.DTYPE_TORCH: + raise ValueError( + f"Expected input dtype {self.DTYPE_TORCH}, got {contig_tokens.dtype}" + ) + + if len(contig_tokens.shape) != 2: + raise ValueError( + f"Expected 2D input tensor, got shape {contig_tokens.shape}" + ) + + required_params = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] + for param in required_params: + if not hasattr(module, param) or module.get_parameter(param) is None: + raise ValueError(f"Module missing required parameter: {param}") def _get_weights(self, module): - """Extract pre-transposed weight tensors from module.""" + """Extract and return weight tensors from module.""" return { - "gate": module.get_parameter( - "gate_proj_weight" - ), # Pre-transposed: [num_experts, hidden_size, intermediate_size] - "up": module.get_parameter( - "up_proj_weight" - ), # Pre-transposed: [num_experts, hidden_size, intermediate_size] - "down": module.get_parameter( - "down_proj_weight" - ), # Pre-transposed: [num_experts, intermediate_size, hidden_size] + "gate": module.get_parameter("gate_proj_weight"), + "up": module.get_parameter("up_proj_weight"), + "down": module.get_parameter("down_proj_weight"), } def _execute_projections_gpu( self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device ): - """Execute gate and up projections using pre-transposed weights (NO runtime transpose).""" + """Execute gate and up projections using GPU-only operations.""" # Find valid experts using GPU operations valid_mask = m_sizes_gpu > 0 valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] @@ -266,12 +308,12 @@ def _execute_projections_gpu( if len(valid_indices) == 0: return [], [] - # Prepare metadata with pre-transposed weights + # Prepare metadata in batch using GPU operations problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs = ( self._prepare_gate_up_metadata_gpu( input_tokens, - weight1, # Pre-transposed gate weights - weight2, # Pre-transposed up weights + weight1, + weight2, m_sizes_gpu, m_offsets_gpu, valid_indices, @@ -297,7 +339,7 @@ def _prepare_gate_up_metadata_gpu( valid_indices, device, ): - """Prepare metadata for gate and up projections 
using pre-transposed weights.""" + """Prepare metadata for gate and up projections""" problem_sizes = [] strides_abc = [] ptrs_abc = [] @@ -314,7 +356,7 @@ def _prepare_gate_up_metadata_gpu( ) ) - # Convert to Python for iteration (unavoidable for metadata preparation) + # Convert to Python for iteration (unavoidable in this test for metadata preparation) valid_sizes_cpu = valid_sizes.cpu().tolist() valid_offsets_cpu = valid_offsets.cpu().tolist() valid_indices_cpu = valid_indices.cpu().tolist() @@ -323,26 +365,14 @@ def _prepare_gate_up_metadata_gpu( zip(valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu) ): if size > 0: - # Get expert data and PRE-TRANSPOSED weights + # Get expert data expert_tokens = input_tokens[offset : offset + size].contiguous() - gate_weight = gate_weights[ - expert_idx - ].contiguous() # Already [hidden_size, intermediate_size] - up_weight = up_weights[ - expert_idx - ].contiguous() # Already [hidden_size, intermediate_size] - - M, K = expert_tokens.shape # M = batch_size, K = hidden_size - K_weight, N = ( - gate_weight.shape - ) # K_weight = hidden_size, N = intermediate_size + gate_weight = gate_weights[expert_idx].contiguous() + up_weight = up_weights[expert_idx].contiguous() - # Verify dimension compatibility - if K != K_weight: - raise ValueError( - f"Dimension mismatch: input has {K} features but weight expects {K_weight}. " - f"expert_tokens: {expert_tokens.shape}, gate_weight: {gate_weight.shape}" - ) + M, K = expert_tokens.shape + N = gate_weight.shape[0] + L = 1 # Create output tensors gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) @@ -368,7 +398,7 @@ def _prepare_gate_up_metadata_gpu( def _execute_down_projection_gpu( self, hidden_states, down_weights, m_sizes_gpu, device ): - """Execute down projection using pre-transposed weights (NO runtime transpose).""" + """Execute down projection using GPU operations.""" if not hidden_states: return [] @@ -394,7 +424,7 @@ def _execute_down_projection_gpu( def _prepare_down_metadata_gpu( self, hidden_states, down_weights, valid_indices, device ): - """Prepare metadata for down projection using pre-transposed weights.""" + """Prepare metadata for down projection using GPU operations.""" problem_sizes = [] strides_abc = [] ptrs_abc = [] @@ -406,21 +436,10 @@ def _prepare_down_metadata_gpu( for i, expert_idx in enumerate(valid_indices_cpu): if i < len(hidden_states): hidden = hidden_states[i] - down_weight = down_weights[ - expert_idx - ].contiguous() # Already [intermediate_size, hidden_size] - - M, K = hidden.shape # M = batch_size, K = intermediate_size - K_weight, N = ( - down_weight.shape - ) # K_weight = intermediate_size, N = hidden_size + down_weight = down_weights[expert_idx].contiguous() - # Verify dimension compatibility - if K != K_weight: - raise ValueError( - f"Dimension mismatch: hidden has {K} features but weight expects {K_weight}. 
" - f"hidden: {hidden.shape}, down_weight: {down_weight.shape}" - ) + M, K = hidden.shape + N = down_weight.shape[0] # Create output tensor down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) @@ -447,17 +466,11 @@ def _add_projection_to_metadata( strides_abc, ptrs_abc, ): - """Add a single projection to the metadata lists (weights are pre-transposed).""" + """Add a single projection to the metadata lists.""" M, K = input_tensor.shape - K_weight, N = weight_tensor.shape # Pre-transposed weights: [K, N] + N = weight_tensor.shape[0] L = 1 - # Verify dimension compatibility - if K != K_weight: - raise ValueError( - f"Matrix multiplication dimension mismatch: {K} != {K_weight}" - ) - # Convert to MNKL format input_mnkl = input_tensor.unsqueeze(-1).contiguous() weight_mnkl = weight_tensor.unsqueeze(-1).contiguous() @@ -552,8 +565,7 @@ def _get_compiled_kernel( if cache_key not in self._compiled_kernels: print( - f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, " - f"2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}" + f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}" ) self._compiled_kernels[cache_key] = cute.compile( @@ -573,16 +585,14 @@ def _get_compiled_kernel( return self._compiled_kernels[cache_key] def _create_initial_tensors(self, problem_shape, device): - """Create initial CUTE tensors for kernel compilation with pre-transposed weight format.""" + """Create initial CUTE tensors for kernel compilation.""" M, N, K, L = problem_shape - # Create tensors for pre-transposed weight format + # Create tensors tensors = [ - torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A (input) - torch.randn( - K, N, dtype=self.DTYPE_TORCH, device=device - ), # B (pre-transposed weight) - torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C (output) + torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A + torch.randn(N, K, dtype=self.DTYPE_TORCH, device=device), # B + torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C ] # Convert to MNKL format and create CUTE tensors @@ -608,6 +618,7 @@ def _get_tensormap_buffer(self, device): self._tensormap_buffers[device] = from_dlpack( tensormap_tensor, assumed_align=self.ALIGNMENT ) + return self._tensormap_buffers[device] def _compute_total_clusters(self, problem_sizes): @@ -673,3 +684,1199 @@ def _reconstruct_output_gpu( @staticmethod def is_available() -> bool: return HAS_CUTLASS + + +# ================== end, current working version ================== + + +class CUTLASSGroupedGemmStrategy_pre_transpose(GroupGEMMStrategy): + """ + Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. + + Optimized version with pre-transposed weights - based exactly on working "_prev" version + with ONLY the transpose optimization added. 
+ """ + + # Constants (same as before) + SUPPORTED_CLUSTER_SHAPES = [ + (1, 1), + (1, 2), + (1, 4), + (2, 1), + (2, 2), + (2, 4), + (4, 1), + (4, 2), + (4, 4), + ] + + SINGLE_CTA_M_SIZES = [128, 64] + DUAL_CTA_M_SIZES = [256, 128] + N_SIZE_RANGE = range(32, 257, 32) # 32 - 256, step 32 + + DTYPE_TORCH = torch.bfloat16 + DTYPE_CUTLASS = cutlass.BFloat16 + ACC_DTYPE = cutlass.Float32 + ALIGNMENT = 16 + TENSORMAP_COUNT = 3 + TENSORMAP_BYTES = 128 + + def __init__( + self, + custom_activation, + use_2cta_instrs=True, + mma_tiler_mn=(256, 128), + cluster_shape_mn=(4, 4), + ): + """Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture.""" + super().__init__(custom_activation) + self.use_2cta_instrs = use_2cta_instrs + + # Set configuration + self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() + self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() + + # Initialize kernel and hardware info + self._initialize_kernel() + self._initialize_hardware() + + # Initialize caches + self._compiled_kernels = {} + self._tensormap_buffers = {} + + self._log_initialization() + + def _get_default_mma_tiler(self): + """Get default MMA tiler configuration based on CTA mode.""" + return (256, 128) if self.use_2cta_instrs else (128, 128) + + def _get_default_cluster_shape(self): + """Get default cluster shape based on CTA mode.""" + return (2, 2) if self.use_2cta_instrs else (1, 1) + + def _initialize_kernel(self): + """Initialize the CUTLASS grouped GEMM kernel.""" + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.ACC_DTYPE, + use_2cta_instrs=self.use_2cta_instrs, + mma_tiler_mn=self.mma_tiler_mn, + cluster_shape_mn=self.cluster_shape_mn, + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + def _initialize_hardware(self): + """Initialize hardware information and stream.""" + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters( + self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + ) + + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) + + def _log_initialization(self): + """Log initialization information.""" + cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + print(f"Initialized CUTLASSGroupedGemmStrategy for Blackwell with:") + print(f" - 2 CTA instructions: {self.use_2cta_instrs}") + print(f" - MMA tiler (M, N): {self.mma_tiler_mn}") + print(f" - Cluster shape (M, N): {self.cluster_shape_mn}") + print(f" - Cluster size: {cluster_size}") + print(f" - Weight optimization: Pre-transposed (no runtime transpose)") + if cluster_size > 1: + print(f" - Using multi-CTA parallelism") + + def arrange_expert_weights(self, all_weights, submod_name, module): + """Store weights in stacked format with pre-transposition optimization.""" + # Pre-transpose weights to eliminate runtime .t() calls + transposed_weights = [] + for weight in all_weights: + transposed_weights.append(weight.t().contiguous()) + return torch.stack(transposed_weights) + + def execute(self, contig_tokens, m_sizes, m_offsets, module): + """ + Execute using CUTLASS grouped GEMM kernel - GPU-only version. + EXACT copy of working version except weights are pre-transposed. 
+ + Args: + contig_tokens: Input tokens arranged contiguously by expert + m_sizes: Tensor of expert sizes (GPU tensor to avoid sync) + m_offsets: Tensor of expert offsets (GPU tensor to avoid sync) + module: MoE module containing weights + """ + # Convert to GPU tensors if needed (avoid CPU-GPU sync) + m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( + m_sizes, m_offsets, contig_tokens.device + ) + + # Get weights and device + weights = self._get_weights(module) + device = contig_tokens.device + + # Prepare output tensor - adjust for pre-transposed weights + # Final output size should be hidden_size (2048) + # In pre-transposed down weights: [num_experts, intermediate_size, hidden_size] + # So shape[2] gives us hidden_size + output = torch.zeros( + contig_tokens.shape[0], + weights["down"].shape[2], # hidden_size from pre-transposed down weights + dtype=self.DTYPE_TORCH, + device=device, + ) + + # Check for valid experts using GPU operations (no sync) + if not self._has_valid_experts_gpu(m_sizes_gpu): + return output + + # Execute the three-stage computation using GPU-only operations + gate_outputs, up_outputs = self._execute_projections_gpu( + contig_tokens, + weights["gate"], + weights["up"], + m_sizes_gpu, + m_offsets_gpu, + device, + ) + + hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) + + final_outputs = self._execute_down_projection_gpu( + hidden_states, weights["down"], m_sizes_gpu, device + ) + + return self._reconstruct_output_gpu( + final_outputs, m_sizes_gpu, m_offsets_gpu, output + ) + + def _ensure_gpu_tensors(self, m_sizes, m_offsets, device): + """Ensure sizes and offsets are GPU tensors to avoid CPU-GPU sync.""" + if not isinstance(m_sizes, torch.Tensor): + m_sizes_gpu = torch.tensor(m_sizes, dtype=torch.int32, device=device) + else: + m_sizes_gpu = m_sizes.to(device=device, dtype=torch.int32) + + if not isinstance(m_offsets, torch.Tensor): + m_offsets_gpu = torch.tensor(m_offsets, dtype=torch.int32, device=device) + else: + m_offsets_gpu = m_offsets.to(device=device, dtype=torch.int32) + + return m_sizes_gpu, m_offsets_gpu + + def _has_valid_experts_gpu(self, m_sizes_gpu): + """Check if any experts have tokens using GPU operations (no sync).""" + return torch.any( + m_sizes_gpu > 0 + ).item() # Single sync here is unavoidable for control flow + + def _get_weights(self, module): + """Extract and return weight tensors from module.""" + return { + "gate": module.get_parameter("gate_proj_weight"), + "up": module.get_parameter("up_proj_weight"), + "down": module.get_parameter("down_proj_weight"), + } + + def _execute_projections_gpu( + self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device + ): + """Execute gate and up projections using GPU-only operations.""" + # Find valid experts using GPU operations + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return [], [] + + # Prepare metadata in batch using GPU operations + problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs = ( + self._prepare_gate_up_metadata_gpu( + input_tokens, + weight1, + weight2, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ) + ) + + if len(problem_sizes) == 0: + return [], [] + + # Execute grouped GEMM + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return gate_outputs, up_outputs + + def _prepare_gate_up_metadata_gpu( + self, + input_tokens, + gate_weights, + up_weights, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + 
device, + ): + """Prepare metadata for gate and up projections""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + gate_outputs = [] + up_outputs = [] + + # Extract valid sizes and offsets (minimal sync - only for valid experts) + valid_sizes = m_sizes_gpu[valid_indices] + valid_offsets = ( + m_offsets_gpu[valid_indices] + if len(m_offsets_gpu) > len(valid_indices) + else torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 + ) + ) + + # Convert to Python for iteration (unavoidable in this test for metadata preparation) + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + valid_indices_cpu = valid_indices.cpu().tolist() + + for i, (expert_idx, size, offset) in enumerate( + zip(valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu) + ): + if size > 0: + # Get expert data - weights are now PRE-TRANSPOSED, no .t() needed! + expert_tokens = input_tokens[offset : offset + size].contiguous() + gate_weight = gate_weights[ + expert_idx + ].contiguous() # Already transposed + up_weight = up_weights[expert_idx].contiguous() # Already transposed + + M, K = expert_tokens.shape + # Pre-transposed gate/up weights: [hidden_size, intermediate_size] + # So gate_weight.shape = [hidden_size, intermediate_size] = [2048, 1408] + # We want N = intermediate_size for the output + K_weight, N = ( + gate_weight.shape + ) # K_weight=2048 (hidden), N=1408 (intermediate) + L = 1 + + # Verify dimensions match for matrix multiplication + if K != K_weight: + raise ValueError( + f"Dimension mismatch: expert_tokens {expert_tokens.shape} vs gate_weight {gate_weight.shape}" + ) + + # Create output tensors with intermediate_size (N=1408) + gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + up_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + + # Add both projections to metadata + for weight, output, output_list in [ + (gate_weight, gate_output, gate_outputs), + (up_weight, up_output, up_outputs), + ]: + self._add_projection_to_metadata( + expert_tokens, + weight, + output, + problem_sizes, + strides_abc, + ptrs_abc, + ) + output_list.append(output) + + return problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs + + def _execute_down_projection_gpu( + self, hidden_states, down_weights, m_sizes_gpu, device + ): + """Execute down projection using GPU operations.""" + if not hidden_states: + return [] + + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + # Prepare metadata + problem_sizes, strides_abc, ptrs_abc, down_outputs = ( + self._prepare_down_metadata_gpu( + hidden_states, down_weights, valid_indices, device + ) + ) + + if len(problem_sizes) == 0: + return [] + + # Execute grouped GEMM + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return down_outputs + + def _prepare_down_metadata_gpu( + self, hidden_states, down_weights, valid_indices, device + ): + """Prepare metadata for down projection using GPU operations.""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + down_outputs = [] + + # Convert indices to CPU for iteration (minimal sync) + valid_indices_cpu = valid_indices.cpu().tolist() + + for i, expert_idx in enumerate(valid_indices_cpu): + if i < len(hidden_states): + hidden = hidden_states[i] + down_weight = down_weights[ + expert_idx + ].contiguous() # Already transposed + + M, K = hidden.shape # [batch, intermediate_size] = [12288, 1408] + # Pre-transposed down weights: 
[intermediate_size, hidden_size] = [1408, 2048] + K_weight, N = ( + down_weight.shape + ) # K_weight=1408 (intermediate), N=2048 (hidden) + + # Verify dimensions match for matrix multiplication + if K != K_weight: + raise ValueError( + f"Dimension mismatch: hidden {hidden.shape} vs down_weight {down_weight.shape}" + ) + + # Create output tensor with hidden_size (N=2048) + down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + down_outputs.append(down_output) + + # Add to metadata + self._add_projection_to_metadata( + hidden, + down_weight, + down_output, + problem_sizes, + strides_abc, + ptrs_abc, + ) + + return problem_sizes, strides_abc, ptrs_abc, down_outputs + + def _add_projection_to_metadata( + self, + input_tensor, + weight_tensor, + output_tensor, + problem_sizes, + strides_abc, + ptrs_abc, + ): + """Add a single projection to the metadata lists with pre-transposed weights.""" + M, K = input_tensor.shape + # Pre-transposed weights have shape [K, N] where K matches input's last dim + K_weight, N = weight_tensor.shape + L = 1 + + # Verify dimension compatibility + if K != K_weight: + raise ValueError( + f"Matrix multiplication dimension mismatch: input {input_tensor.shape} vs weight {weight_tensor.shape}" + ) + + # Convert to MNKL format + input_mnkl = input_tensor.unsqueeze(-1).contiguous() + weight_mnkl = weight_tensor.unsqueeze(-1).contiguous() + output_mnkl = output_tensor.unsqueeze(-1).contiguous() + + # Extract strides + input_strides = list(input_mnkl.stride()[:2]) + weight_strides = list(weight_mnkl.stride()[:2]) + output_strides = list(output_mnkl.stride()[:2]) + + # Add to metadata + problem_sizes.append([M, N, K, L]) + strides_abc.append([input_strides, weight_strides, output_strides]) + ptrs_abc.append( + [ + input_tensor.data_ptr(), + weight_tensor.data_ptr(), + output_tensor.data_ptr(), + ] + ) + + def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): + """Execute the grouped GEMM kernel - EXACT copy of working version.""" + num_groups = len(problem_sizes) + + # Convert to CUTE tensors + problem_sizes_cute, strides_cute, ptrs_cute = self._convert_to_cute_tensors( + problem_sizes, strides_abc, ptrs_abc, device + ) + + # Get tensormap and compute clusters + tensormap_cute = self._get_tensormap_buffer(device) + total_clusters = self._compute_total_clusters(problem_sizes) + + # Get initial tensors for compilation + initial_tensors = self._create_initial_tensors(problem_sizes[0], device) + + # Compile or retrieve kernel + compiled_kernel = self._get_compiled_kernel( + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ) + + # Execute kernel + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + torch.cuda.synchronize() + + def _convert_to_cute_tensors(self, problem_sizes, strides_abc, ptrs_abc, device): + """Convert metadata to CUTE tensors - EXACT copy of working version.""" + problem_sizes_tensor = torch.tensor( + problem_sizes, dtype=torch.int32, device=device + ) + strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) + ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) + + return ( + from_dlpack(problem_sizes_tensor, assumed_align=self.ALIGNMENT), + from_dlpack(strides_tensor, assumed_align=self.ALIGNMENT), + from_dlpack(ptrs_tensor, assumed_align=self.ALIGNMENT), + ) + + def _get_compiled_kernel( + self, + num_groups, + total_clusters, + 
initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ): + """Get or compile the grouped GEMM kernel - EXACT copy of working version.""" + cache_key = ( + num_groups, + total_clusters, + self.use_2cta_instrs, + self.mma_tiler_mn, + self.cluster_shape_mn, + ) + + if cache_key not in self._compiled_kernels: + print( + f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}" + ) + + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + *initial_tensors, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + self.stream, + ) + print("Kernel compilation successful") + + return self._compiled_kernels[cache_key] + + def _create_initial_tensors(self, problem_shape, device): + """Create initial CUTE tensors for kernel compilation - EXACT copy of working version.""" + M, N, K, L = problem_shape + + # Create tensors - SAME AS WORKING VERSION + tensors = [ + torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A + torch.randn(N, K, dtype=self.DTYPE_TORCH, device=device), # B + torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C + ] + + # Convert to MNKL format and create CUTE tensors + cute_tensors = [] + for tensor in tensors: + mnkl_tensor = tensor.unsqueeze(-1).contiguous() + cute_tensor = from_dlpack(mnkl_tensor, assumed_align=self.ALIGNMENT) + cute_tensor.element_type = self.DTYPE_CUTLASS + cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) + cute_tensors.append(cute_tensor) + + return cute_tensors + + def _get_tensormap_buffer(self, device): + """Get or create tensormap buffer - EXACT copy of working version.""" + if device not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + tensormap_tensor = torch.zeros( + (sm_count, self.TENSORMAP_COUNT, self.TENSORMAP_BYTES // 8), + dtype=torch.int64, + device=device, + ) + self._tensormap_buffers[device] = from_dlpack( + tensormap_tensor, assumed_align=self.ALIGNMENT + ) + + return self._tensormap_buffers[device] + + def _compute_total_clusters(self, problem_sizes): + """Compute total number of clusters needed - EXACT copy of working version.""" + cluster_tile_m = self.mma_tiler_mn[0] + cluster_tile_n = self.mma_tiler_mn[1] + + # Adjust for 2 CTA mode and cluster shape + if self.use_2cta_instrs: + cluster_tile_m //= 2 + + cluster_tile_m *= self.cluster_shape_mn[0] + cluster_tile_n *= self.cluster_shape_mn[1] + + total = 0 + for M, N, K, L in problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + + return total + + def _apply_activation_and_combine(self, gate_outputs, up_outputs): + """Apply activation and combine gate/up outputs - EXACT copy of working version.""" + return [ + self.activation_function(gate_out) * up_out + for gate_out, up_out in zip(gate_outputs, up_outputs) + ] + + def _reconstruct_output_gpu( + self, final_outputs, m_sizes_gpu, m_offsets_gpu, output + ): + """Reconstruct the full output tensor using GPU operations - EXACT copy of working version.""" + if not final_outputs: + return output + + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + valid_sizes = m_sizes_gpu[valid_indices] + + # Compute offsets if not provided properly + if len(m_offsets_gpu) <= len(valid_indices): + valid_offsets = 
torch.cumsum( + torch.cat( + [torch.tensor([0], device=m_sizes_gpu.device), valid_sizes[:-1]] + ), + dim=0, + ) + else: + valid_offsets = m_offsets_gpu[valid_indices] + + # Convert to CPU for final reconstruction (minimal sync) + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + + for i, (size, offset) in enumerate(zip(valid_sizes_cpu, valid_offsets_cpu)): + if i < len(final_outputs): + output[offset : offset + size] = final_outputs[i] + + return output + + @staticmethod + def is_available() -> bool: + return HAS_CUTLASS + + +# =================== prev version =================== + + +class CUTLASSGroupedGemmStrategy_external_converter(GroupGEMMStrategy): + """ + Improved CUTLASS grouped GEMM strategy using converter classes. + + This version provides cleaner code with better separation of concerns: + - Tensor conversion is handled by dedicated converter classes + - Reduced boilerplate and manual tensor manipulation + - Better error handling and validation + - More maintainable codebase + """ + + # Configuration constants + SUPPORTED_CLUSTER_SHAPES = [ + (1, 1), + (1, 2), + (1, 4), + (2, 1), + (2, 2), + (2, 4), + (4, 1), + (4, 2), + (4, 4), + ] + + SINGLE_CTA_M_SIZES = [128, 64] + DUAL_CTA_M_SIZES = [256, 128] + N_SIZE_RANGE = range(32, 257, 32) + + DTYPE_TORCH = torch.bfloat16 + DTYPE_CUTLASS = cutlass.BFloat16 + ACC_DTYPE = cutlass.Float32 + ALIGNMENT = 16 + + def __init__( + self, + custom_activation, + use_2cta_instrs: bool = True, + mma_tiler_mn: Tuple[int, int] = (256, 128), + cluster_shape_mn: Tuple[int, int] = (4, 4), + ): + """ + Initialize the improved CUTLASS grouped GEMM strategy. + + Args: + custom_activation: Activation function (e.g., SiLU) + use_2cta_instrs: Whether to use 2-CTA instructions + mma_tiler_mn: MMA tile sizes (M, N) + cluster_shape_mn: Cluster shape (M, N) + """ + if not HAS_CUTLASS: + raise RuntimeError("CUTLASS not available") + + self.activation_function = custom_activation + self.use_2cta_instrs = use_2cta_instrs + self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() + self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() + + # Initialize converter and tensor manager + self.converter = PyTorchToCuteConverter( + default_alignment=self.ALIGNMENT, default_acc_dtype=self.ACC_DTYPE + ) + self.tensor_manager = GroupedGemmTensorManager( + alignment=self.ALIGNMENT, dtype=self.DTYPE_TORCH + ) + + # Initialize CUTLASS components + self._initialize_kernel() + self._initialize_hardware() + + # Caches + self._compiled_kernels = {} + self._tensormap_buffers = {} + + self._log_initialization() + + def _get_default_mma_tiler(self) -> Tuple[int, int]: + """Get default MMA tiler based on CTA mode.""" + return (256, 128) if self.use_2cta_instrs else (128, 128) + + def _get_default_cluster_shape(self) -> Tuple[int, int]: + """Get default cluster shape based on CTA mode.""" + return (2, 2) if self.use_2cta_instrs else (1, 1) + + def _initialize_kernel(self): + """Initialize the CUTLASS grouped GEMM kernel.""" + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.ACC_DTYPE, + use_2cta_instrs=self.use_2cta_instrs, + mma_tiler_mn=self.mma_tiler_mn, + cluster_shape_mn=self.cluster_shape_mn, + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + def _initialize_hardware(self): + """Initialize hardware information and CUDA stream.""" + # TODO - if we do not have a cuda context, this will fail... 
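+        # Illustrative guard for the TODO above (an addition for clarity, not
+        # part of the original control flow): touching the CUDA runtime first
+        # makes sure a context exists before HardwareInfo() queries the device.
+        # torch.cuda.init() does nothing if CUDA state is already initialized.
+        if torch.cuda.is_available():
+            torch.cuda.init()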
+ self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters( + self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + ) + + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) + + def _log_initialization(self): + """Log initialization details.""" + cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + print(f"✅ Improved CUTLASS Strategy initialized:") + print(f" - 2 CTA mode: {self.use_2cta_instrs}") + print(f" - MMA tiler: {self.mma_tiler_mn}") + print(f" - Cluster shape: {self.cluster_shape_mn}") + print(f" - Max active clusters: {self.max_active_clusters}") + + def arrange_expert_weights( + self, all_weights: List[torch.Tensor], submod_name: str, module + ) -> torch.Tensor: + """Store weights in stacked format.""" + # TODO - let's pre-transsose... + return torch.stack(all_weights) + + def execute( + self, + contig_tokens: torch.Tensor, + m_sizes: torch.Tensor, + m_offsets: torch.Tensor, + module, + ) -> torch.Tensor: + """ + Execute grouped GEMM operation using improved tensor management. + + Args: + contig_tokens: Input tokens arranged contiguously by expert + m_sizes: Expert sizes tensor + m_offsets: Expert offsets tensor + module: MoE module containing weights + + Returns: + Processed output tokens + """ + # Ensure GPU tensors to avoid CPU-GPU sync + m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( + m_sizes, m_offsets, contig_tokens.device + ) + + # Get weights and validate + weights = self._get_and_validate_weights(module) + device = contig_tokens.device + + # Early exit if no valid experts + if not self._has_valid_experts(m_sizes_gpu): + return torch.zeros( + contig_tokens.shape[0], + weights["gate"].shape[2], + dtype=self.DTYPE_TORCH, + device=device, + ) + + # Execute three-stage MoE computation + return self._execute_moe_computation( + contig_tokens, weights, m_sizes_gpu, m_offsets_gpu, device + ) + + def _ensure_gpu_tensors( + self, m_sizes, m_offsets, device + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Ensure sizes and offsets are GPU tensors.""" + if not isinstance(m_sizes, torch.Tensor): + m_sizes_gpu = torch.tensor(m_sizes, dtype=torch.int32, device=device) + else: + m_sizes_gpu = m_sizes.to(device=device, dtype=torch.int32) + + if not isinstance(m_offsets, torch.Tensor): + m_offsets_gpu = torch.tensor(m_offsets, dtype=torch.int32, device=device) + else: + m_offsets_gpu = m_offsets.to(device=device, dtype=torch.int32) + + return m_sizes_gpu, m_offsets_gpu + + def _get_and_validate_weights(self, module) -> Dict[str, torch.Tensor]: + """Extract and validate weight tensors.""" + required_weights = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] + weights = {} + + for weight_name in required_weights: + if not hasattr(module, weight_name): + raise ValueError(f"Module missing required weight: {weight_name}") + weights[weight_name.split("_")[0]] = module.get_parameter(weight_name) + + return weights + + def _has_valid_experts(self, m_sizes_gpu: torch.Tensor) -> bool: + """Check if any experts have tokens (single sync point).""" + return torch.any(m_sizes_gpu > 0).item() + + def _execute_moe_computation( + self, + contig_tokens: torch.Tensor, + weights: Dict[str, torch.Tensor], + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + device: torch.device, + ) -> torch.Tensor: + """Execute the complete MoE computation pipeline.""" + print(f"⚙️ Executing MoE computation on {device}") + print(f"Stage 1: Gate and Up projections") + if 
m_sizes_gpu.requires_grad: + m_sizes_gpu = m_sizes_gpu.detach() + m_offsets_gpu = m_offsets_gpu.detach() + + # Stage 1: Gate and Up projections + gate_outputs, up_outputs = self._execute_gate_up_projections( + contig_tokens, + weights["gate"].detach(), + weights["up"].detach(), + m_sizes_gpu, + m_offsets_gpu, + device, + ) + + print(f"Stage 2: Apply activation and combine") + # Stage 2: Apply activation and combine + hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) + + # Stage 3: Down projection + print(f"Stage 3: Down projection") + down_outputs = self._execute_down_projection( + hidden_states, weights["down"].detach(), m_sizes_gpu, device + ) + + # Stage 4: Reconstruct output + print(f"Stage 4: Reconstruct output") + return self._reconstruct_output( + down_outputs, contig_tokens, m_sizes_gpu, m_offsets_gpu + ) + + def _execute_gate_up_projections( + self, + input_tokens: torch.Tensor, + gate_weights: torch.Tensor, + up_weights: torch.Tensor, + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + device: torch.device, + ) -> Tuple[List, List]: + """Execute gate and up projections using the tensor manager.""" + + # Get valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return [], [] + + # Prepare expert operations using tensor manager + gate_ops, up_ops = self._prepare_gate_up_operations( + input_tokens, + gate_weights, + up_weights, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ) + + # Execute grouped GEMMs + if gate_ops["inputs"]: + self._execute_grouped_gemm_operations(gate_ops, device, "gate_up") + + return gate_ops["outputs"], up_ops["outputs"] + + def _prepare_gate_up_operations( + self, + input_tokens: torch.Tensor, + gate_weights: torch.Tensor, + up_weights: torch.Tensor, + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + valid_indices: torch.Tensor, + device: torch.device, + ) -> Tuple[Dict, Dict]: + """Prepare gate and up operations using the tensor manager.""" + + # Convert indices for iteration (minimal sync) + valid_indices_cpu = valid_indices.cpu().tolist() + valid_sizes = m_sizes_gpu[valid_indices].cpu().tolist() + valid_offsets = ( + self._compute_valid_offsets( + m_sizes_gpu, m_offsets_gpu, valid_indices, device + ) + .cpu() + .tolist() + ) + + # Prepare operation lists + gate_ops = {"inputs": [], "weights": [], "outputs": [], "metadata": []} + up_ops = {"inputs": [], "weights": [], "outputs": [], "metadata": []} + + for expert_idx, size, offset in zip( + valid_indices_cpu, valid_sizes, valid_offsets + ): + if size > 0: + # Get expert data + expert_input = input_tokens[offset : offset + size].contiguous() + gate_weight = gate_weights[expert_idx].contiguous() + up_weight = up_weights[expert_idx].contiguous() + + # Create output tensors + M, K = expert_input.shape + N = gate_weight.shape[0] # Assuming [out_features, in_features] + + gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + up_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + + # Use tensor manager to prepare operations + for ops, weight, output in [ + (gate_ops, gate_weight, gate_output), + (up_ops, up_weight, up_output), + ]: + + ( + cute_input, + cute_weight, + cute_output, + problem_size, + strides, + ptrs, + ) = self.tensor_manager.prepare_expert_operation( + expert_input, weight, output, transpose_weight=True + ) + + ops["inputs"].append(expert_input) + ops["weights"].append(weight) + ops["outputs"].append(output) + 
ops["metadata"].append((problem_size, strides, ptrs)) + + return gate_ops, up_ops + + def _compute_valid_offsets( + self, + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + valid_indices: torch.Tensor, + device: torch.device, + ) -> torch.Tensor: + """Compute valid offsets for expert operations.""" + valid_sizes = m_sizes_gpu[valid_indices] + + if len(m_offsets_gpu) > len(valid_indices): + return m_offsets_gpu[valid_indices] + else: + return torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 + ) + + def _execute_down_projection( + self, + hidden_states: List[torch.Tensor], + down_weights: torch.Tensor, + m_sizes_gpu: torch.Tensor, + device: torch.device, + ) -> List[torch.Tensor]: + """Execute down projection using tensor manager.""" + + if not hidden_states: + return [] + + # Get valid expert indices + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + valid_indices_cpu = valid_indices.cpu().tolist() + + # Prepare down projection operations + down_ops = {"inputs": [], "weights": [], "outputs": [], "metadata": []} + + for i, expert_idx in enumerate(valid_indices_cpu): + if i < len(hidden_states): + hidden = hidden_states[i] + down_weight = down_weights[expert_idx].contiguous() + + M, K = hidden.shape + N = down_weight.shape[0] + down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + + # Use tensor manager + cute_input, cute_weight, cute_output, problem_size, strides, ptrs = ( + self.tensor_manager.prepare_expert_operation( + hidden, down_weight, down_output, transpose_weight=True + ) + ) + + down_ops["inputs"].append(hidden) + down_ops["weights"].append(down_weight) + down_ops["outputs"].append(down_output) + down_ops["metadata"].append((problem_size, strides, ptrs)) + + # Execute grouped GEMM + if down_ops["inputs"]: + self._execute_grouped_gemm_operations(down_ops, device, "down") + + return down_ops["outputs"] + + def _execute_grouped_gemm_operations( + self, operations: Dict, device: torch.device, stage_name: str + ): + """Execute grouped GEMM operations using converter.""" + + if not operations["metadata"]: + return + + # Extract metadata + all_problem_sizes = [] + all_strides = [] + all_ptrs = [] + + for problem_size, strides, ptrs in operations["metadata"]: + all_problem_sizes.append(problem_size) + all_strides.append(strides) + all_ptrs.append(ptrs) + + # Create CUTE metadata tensors using converter + problem_sizes_cute, strides_cute, ptrs_cute = ( + self.converter.create_metadata_tensors( + all_problem_sizes, all_strides, all_ptrs, device + ) + ) + + # Get other required tensors + num_groups = len(all_problem_sizes) + total_clusters = self._compute_total_clusters(all_problem_sizes) + tensormap_cute = self._get_tensormap_buffer(device) + + # Create initial tensors for compilation using converter + initial_tensors = self.converter.create_initial_compilation_tensors( + tuple(all_problem_sizes[0]), device, self.DTYPE_TORCH + ) + + # Get or Compile kernel + compiled_kernel = self._get_or_compile_kernel( + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ) + + # Execute + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + torch.cuda.synchronize() + + def _get_or_compile_kernel( + self, + num_groups: int, + total_clusters: int, + initial_tensors: List, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ): + """Get 
compiled kernel from cache or compile new one.""" + + cache_key = ( + num_groups, + total_clusters, + self.use_2cta_instrs, + self.mma_tiler_mn, + self.cluster_shape_mn, + ) + + if cache_key not in self._compiled_kernels: + print(f"Compiling kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}") + + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + *initial_tensors, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + self.stream, + ) + print("✅ Kernel compilation successful") + + return self._compiled_kernels[cache_key] + + def _get_tensormap_buffer(self, device: torch.device): + """Get tensormap buffer using converter.""" + if device not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + self._tensormap_buffers[device] = self.converter.create_tensormap_buffer( + device, sm_count, tensormap_count=3, tensormap_bytes=128 + ) + return self._tensormap_buffers[device] + + def _compute_total_clusters(self, problem_sizes: List[List[int]]) -> int: + """Compute total clusters needed for all problems.""" + cluster_tile_m = self.mma_tiler_mn[0] + cluster_tile_n = self.mma_tiler_mn[1] + + if self.use_2cta_instrs: + cluster_tile_m //= 2 + + cluster_tile_m *= self.cluster_shape_mn[0] + cluster_tile_n *= self.cluster_shape_mn[1] + + total = 0 + for M, N, K, L in problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + + return total + + def _apply_activation_and_combine( + self, gate_outputs: List[torch.Tensor], up_outputs: List[torch.Tensor] + ) -> List[torch.Tensor]: + """Apply activation function and combine gate/up outputs.""" + if not gate_outputs or not up_outputs: + return [] + + return [ + self.activation_function(gate_out) * up_out + for gate_out, up_out in zip(gate_outputs, up_outputs) + ] + + def _reconstruct_output( + self, + down_outputs: List[torch.Tensor], + contig_tokens: torch.Tensor, + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + ) -> torch.Tensor: + """Reconstruct the full output tensor.""" + + # Initialize output + output = torch.zeros( + contig_tokens.shape[0], + down_outputs[0].shape[1] if down_outputs else contig_tokens.shape[1], + dtype=self.DTYPE_TORCH, + device=contig_tokens.device, + ) + + if not down_outputs: + return output + + # Get valid expert information + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + valid_sizes = m_sizes_gpu[valid_indices].cpu().tolist() + valid_offsets = ( + self._compute_valid_offsets( + m_sizes_gpu, m_offsets_gpu, valid_indices, contig_tokens.device + ) + .cpu() + .tolist() + ) + + # Copy results back + for i, (size, offset) in enumerate(zip(valid_sizes, valid_offsets)): + if i < len(down_outputs) and size > 0: + output[offset : offset + size] = down_outputs[i] + + return output + + @staticmethod + def is_available() -> bool: + """Check if CUTLASS is available.""" + return HAS_CUTLASS diff --git a/torchtitan/experiments/deepseek_v3/generate.py b/torchtitan/experiments/deepseek_v3/generate.py index b2fe217a8..50223f4ea 100644 --- a/torchtitan/experiments/deepseek_v3/generate.py +++ b/torchtitan/experiments/deepseek_v3/generate.py @@ -225,7 +225,7 @@ def generate( tokenizer, dist_config, messages: list[dict], - n_tokens: int = 200, + n_tokens: int = 80, ): rank = dist.get_rank() device = dist_config.device From 
016ca470ddb4c129fc18ec3c8403e172ad2a3c88 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sun, 22 Jun 2025 08:13:24 -0700 Subject: [PATCH 32/34] remove torch.cuda.synchronize --- torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py index 1e4f51960..68f35581a 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py @@ -528,7 +528,7 @@ def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): tensormap_cute, self.stream, ) - torch.cuda.synchronize() + # torch.cuda.synchronize() def _convert_to_cute_tensors(self, problem_sizes, strides_abc, ptrs_abc, device): """Convert metadata to CUTE tensors.""" From c3ee57ca1839e70a77642b46b7203ffa151731d2 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sun, 22 Jun 2025 11:00:25 -0700 Subject: [PATCH 33/34] reasonable working version --- .../deepseek_v3/cutlass_grouped_gemm.py | 711 +++++++++++++++++- .../experiments/deepseek_v3/inference.sh | 2 + 2 files changed, 694 insertions(+), 19 deletions(-) diff --git a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py index 68f35581a..ad02229d4 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py @@ -1,10 +1,5 @@ """ - Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. - - Optimized version with pre-transposed weights to eliminate runtime transpose operations. -""" - -""" + Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell. """ @@ -15,10 +10,12 @@ # export CUTE_DSL_FILE_CACHING_CAPACITY=1000 import logging -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Optional, Tuple import torch +from .group_gemms import GroupGEMMStrategy + try: import cuda.bindings.driver as cuda @@ -43,8 +40,6 @@ PyTorchToCuteConverter, ) -# Import base class - adjust path as needed based on your project structure -from .group_gemms import GroupGEMMStrategy logger = logging.getLogger(__name__) @@ -59,7 +54,7 @@ class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): This version eliminates CPU-GPU synchronization by keeping all size/offset computations on GPU. 
""" - # Constants (same as before) + # ----- Config Constants -------- SUPPORTED_CLUSTER_SHAPES = [ (1, 1), (1, 2), @@ -83,6 +78,8 @@ class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): TENSORMAP_COUNT = 3 TENSORMAP_BYTES = 128 + # ------- end constants ------- # + def __init__( self, custom_activation, @@ -90,17 +87,22 @@ def __init__( mma_tiler_mn=(256, 128), cluster_shape_mn=(4, 4), ): - """Initialize the CUTLASS grouped GEMM strategy for Blackwell architecture.""" + """Initialize the CUTLASS grouped GEMM strategy.""" super().__init__(custom_activation) - self.use_2cta_instrs = use_2cta_instrs # Set configuration + self.use_2cta_instrs = use_2cta_instrs self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() # Validate configurations # self._validate_configurations() + print(f"Initializing CUTLASSGroupedGemmStrategy for Blackwell with:") + print(f" - 2 CTA instructions: {self.use_2cta_instrs}") + print(f" - MMA tiler (M, N): {mma_tiler_mn}") + print(f" - Cluster shape (M, N): {cluster_shape_mn}") + # Initialize kernel and hardware info self._initialize_kernel() self._initialize_hardware() @@ -109,7 +111,7 @@ def __init__( self._compiled_kernels = {} self._tensormap_buffers = {} - self._log_initialization() + # self._log_initialization() def _get_default_mma_tiler(self): """Get default MMA tiler configuration based on CTA mode.""" @@ -139,6 +141,7 @@ def _initialize_hardware(self): torch_stream = torch.cuda.current_stream() self.stream = cuda.CUstream(torch_stream.cuda_stream) + # ------ validations ------ def _validate_configurations(self): """Validate configurations for Blackwell.""" self._validate_mma_tiler() @@ -183,6 +186,8 @@ def _validate_2cta_constraints(self): f"Valid 2 CTA cluster shapes: {valid_2cta_shapes}" ) + # ------ end validations ------ + def _log_initialization(self): """Log initialization information.""" cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] @@ -200,7 +205,7 @@ def arrange_expert_weights(self, all_weights, submod_name, module): def execute(self, contig_tokens, m_sizes, m_offsets, module): """ - Execute using CUTLASS grouped GEMM kernel - GPU-only version. + Execute using CUTLASS grouped GEMM kernel - try to minimize cpu-gpu syncs. 
Args: contig_tokens: Input tokens arranged contiguously by expert @@ -268,9 +273,7 @@ def _ensure_gpu_tensors(self, m_sizes, m_offsets, device): def _has_valid_experts_gpu(self, m_sizes_gpu): """Check if any experts have tokens using GPU operations (no sync).""" - return torch.any( - m_sizes_gpu > 0 - ).item() # Single sync here is unavoidable for control flow + return torch.any(m_sizes_gpu > 0).item() # Single sync here def _validate_inputs(self, contig_tokens, m_sizes_gpu, module): """Validate input parameters.""" @@ -356,7 +359,7 @@ def _prepare_gate_up_metadata_gpu( ) ) - # Convert to Python for iteration (unavoidable in this test for metadata preparation) + # Convert to Python for iteration valid_sizes_cpu = valid_sizes.cpu().tolist() valid_offsets_cpu = valid_offsets.cpu().tolist() valid_indices_cpu = valid_indices.cpu().tolist() @@ -563,6 +566,8 @@ def _get_compiled_kernel( self.cluster_shape_mn, ) + # print(f"Cache key: {cache_key} ") + if cache_key not in self._compiled_kernels: print( f"Compiling CUTLASS grouped GEMM kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}, cluster={self.cluster_shape_mn}" @@ -580,7 +585,7 @@ def _get_compiled_kernel( self.max_active_clusters, self.stream, ) - print("Kernel compilation successful") + print(f"Kernel compilation successful, {self.cluster_shape_mn=}") return self._compiled_kernels[cache_key] @@ -689,6 +694,673 @@ def is_available() -> bool: # ================== end, current working version ================== +class CUTLASSGroupedGemmStrategy_dynamic_transpose(GroupGEMMStrategy): + """ + Optimized CUTLASS grouped GEMM strategy with pre-transposed weights. + + This version correctly handles pre-transposed weights throughout the entire + pipeline, from weight arrangement to kernel execution. + """ + + # Configuration constants + SUPPORTED_CLUSTER_SHAPES = [ + (1, 1), + (1, 2), + (1, 4), + (2, 1), + (2, 2), + (2, 4), + (4, 1), + (4, 2), + (4, 4), + ] + + SINGLE_CTA_M_SIZES = [128, 64] + DUAL_CTA_M_SIZES = [256, 128] + N_SIZE_RANGE = range(32, 257, 32) + + DTYPE_TORCH = torch.bfloat16 + DTYPE_CUTLASS = cutlass.BFloat16 + ACC_DTYPE = cutlass.Float32 + ALIGNMENT = 16 + TENSORMAP_COUNT = 3 + TENSORMAP_BYTES = 128 + + def __init__( + self, + custom_activation, + use_2cta_instrs: bool = True, + mma_tiler_mn: Optional[Tuple[int, int]] = None, + cluster_shape_mn: Optional[Tuple[int, int]] = None, + use_pretransposed_weights: bool = False, + ): + """ + Initialize the optimized CUTLASS grouped GEMM strategy. 
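+
+        A minimal construction sketch (illustrative; assumes a SiLU activation,
+        with tile and cluster values mirroring the 2-CTA defaults):
+
+        .. code-block:: python
+
+            strategy = CUTLASSGroupedGemmStrategy_dynamic_transpose(
+                custom_activation=torch.nn.functional.silu,
+                use_2cta_instrs=True,
+                mma_tiler_mn=(256, 128),
+                cluster_shape_mn=(2, 2),
+                use_pretransposed_weights=True,
+            )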
+ + Args: + custom_activation: Activation function (e.g., SiLU) + use_2cta_instrs: Whether to use 2-CTA instructions + mma_tiler_mn: MMA tile sizes (M, N) + cluster_shape_mn: Cluster shape (M, N) + use_pretransposed_weights: Whether to pre-transpose weights + """ + super().__init__(custom_activation) + + # if not HAS_CUTLASS: + # raise RuntimeError("CUTLASS not available") + + self.use_2cta_instrs = use_2cta_instrs + self.use_pretransposed_weights = use_pretransposed_weights + + # Set configuration + self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() + self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() + + # Initialize components + self._initialize_kernel() + self._initialize_hardware() + self._initialize_converters() + + # Initialize caches + self._compiled_kernels = {} + self._tensormap_buffers = {} + + # self._log_initialization() + + def _get_default_mma_tiler(self) -> Tuple[int, int]: + """Get default MMA tiler configuration.""" + return (256, 128) if self.use_2cta_instrs else (128, 128) + + def _get_default_cluster_shape(self) -> Tuple[int, int]: + """Get default cluster shape.""" + return (2, 2) if self.use_2cta_instrs else (1, 1) + + def _initialize_kernel(self): + """Initialize the CUTLASS grouped GEMM kernel.""" + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.ACC_DTYPE, + use_2cta_instrs=self.use_2cta_instrs, + mma_tiler_mn=self.mma_tiler_mn, + cluster_shape_mn=self.cluster_shape_mn, + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + def _initialize_hardware(self): + """Initialize hardware information and stream.""" + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters( + self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + ) + + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) + + def _initialize_converters(self): + """Initialize converter utilities.""" + self.converter = PyTorchToCuteConverter( + default_alignment=self.ALIGNMENT, default_acc_dtype=self.ACC_DTYPE + ) + self.tensor_manager = GroupedGemmTensorManager( + alignment=self.ALIGNMENT, dtype=self.DTYPE_TORCH + ) + + def _log_initialization(self): + """Log initialization information.""" + cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + + print(f"✅ Optimized CUTLASS Strategy initialized:") + print(f" - 2 CTA instructions: {self.use_2cta_instrs}") + print(f" - MMA tiler (M, N): {self.mma_tiler_mn}") + print(f" - Cluster shape (M, N): {self.cluster_shape_mn}") + print(f" - Pre-transposed weights: {self.use_pretransposed_weights}") + print(f" - Cluster size: {cluster_size}") + + def arrange_expert_weights( + self, all_weights: List[torch.Tensor], submod_name: str, module + ) -> torch.Tensor: + """ + Store weights in stacked format with optional pre-transposition. 
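+
+        Shape sketch for the pre-transposed path (illustrative, using the
+        hidden=2048 / intermediate=1408 sizes referenced elsewhere in this
+        file):
+
+        .. code-block:: python
+
+            w = torch.randn(1408, 2048)              # [out_features, in_features]
+            stacked = torch.stack([w.t().contiguous()])
+            assert stacked.shape == (1, 2048, 1408)  # [num_experts, in, out]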
+ + For pre-transposed mode: + - Gate/Up: [out_features, in_features] -> [in_features, out_features] + - Down: [out_features, in_features] -> [in_features, out_features] + """ + if self.use_pretransposed_weights: + # Pre-transpose weights for optimal memory access patterns + transposed_weights = [] + for weight in all_weights: + # Transpose and ensure contiguous memory layout + transposed_weights.append(weight.t().contiguous()) + return torch.stack(transposed_weights) + else: + # Keep original layout + return torch.stack(all_weights) + + def execute( + self, + contig_tokens: torch.Tensor, + m_sizes: torch.Tensor, + m_offsets: torch.Tensor, + module, + ) -> torch.Tensor: + """Execute grouped GEMM operation with optimized tensor handling.""" + # Ensure GPU tensors + m_sizes_gpu, m_offsets_gpu = self._ensure_gpu_tensors( + m_sizes, m_offsets, contig_tokens.device + ) + + # Get weights + weights = self._get_weights(module) + device = contig_tokens.device + + # Determine output size based on weight layout + if self.use_pretransposed_weights: + # Pre-transposed down weights: [experts, intermediate_size, hidden_size] + output_size = weights["down"].shape[2] + else: + # Original down weights: [experts, hidden_size, intermediate_size] + output_size = weights["down"].shape[1] + + # Prepare output tensor + output = torch.zeros( + contig_tokens.shape[0], + output_size, + dtype=self.DTYPE_TORCH, + device=device, + ) + + # Check for valid experts + if not self._has_valid_experts_gpu(m_sizes_gpu): + return output + + # Execute three-stage computation + gate_outputs, up_outputs = self._execute_projections_gpu( + contig_tokens, + weights["gate"], + weights["up"], + m_sizes_gpu, + m_offsets_gpu, + device, + ) + + hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) + + final_outputs = self._execute_down_projection_gpu( + hidden_states, + weights["down"], + m_sizes_gpu, + device, + ) + + return self._reconstruct_output_gpu( + final_outputs, m_sizes_gpu, m_offsets_gpu, output + ) + + def _ensure_gpu_tensors(self, m_sizes, m_offsets, device): + """Ensure sizes and offsets are GPU tensors.""" + if not isinstance(m_sizes, torch.Tensor): + m_sizes_gpu = torch.tensor(m_sizes, dtype=torch.int32, device=device) + else: + m_sizes_gpu = m_sizes.to(device=device, dtype=torch.int32) + + if not isinstance(m_offsets, torch.Tensor): + m_offsets_gpu = torch.tensor(m_offsets, dtype=torch.int32, device=device) + else: + m_offsets_gpu = m_offsets.to(device=device, dtype=torch.int32) + + return m_sizes_gpu, m_offsets_gpu + + def _has_valid_experts_gpu(self, m_sizes_gpu): + """Check if any experts have tokens.""" + return torch.any(m_sizes_gpu > 0).item() + + def _get_weights(self, module): + """Extract weight tensors from module.""" + return { + "gate": module.get_parameter("gate_proj_weight"), + "up": module.get_parameter("up_proj_weight"), + "down": module.get_parameter("down_proj_weight"), + } + + def _execute_projections_gpu( + self, input_tokens, weight1, weight2, m_sizes_gpu, m_offsets_gpu, device + ): + """Execute gate and up projections.""" + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return [], [] + + # Prepare metadata + problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs = ( + self._prepare_gate_up_metadata_gpu( + input_tokens, + weight1, + weight2, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ) + ) + + if len(problem_sizes) == 0: + return [], [] + + # Execute grouped GEMM + 
self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return gate_outputs, up_outputs + + def _prepare_gate_up_metadata_gpu( + self, + input_tokens, + gate_weights, + up_weights, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ): + """Prepare metadata for gate and up projections with proper weight handling.""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + gate_outputs = [] + up_outputs = [] + + # Extract valid sizes and offsets + valid_sizes = m_sizes_gpu[valid_indices] + valid_offsets = self._compute_valid_offsets( + m_sizes_gpu, m_offsets_gpu, valid_indices, device + ) + + # Convert to CPU for iteration + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + valid_indices_cpu = valid_indices.cpu().tolist() + + for i, (expert_idx, size, offset) in enumerate( + zip(valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu) + ): + if size > 0: + # Get expert data + expert_tokens = input_tokens[offset : offset + size].contiguous() + gate_weight = gate_weights[expert_idx].contiguous() + up_weight = up_weights[expert_idx].contiguous() + + M, K = expert_tokens.shape + + if self.use_pretransposed_weights: + # Pre-transposed: [in_features, out_features] + K_weight, N = gate_weight.shape + if K != K_weight: + raise ValueError( + f"Dimension mismatch: tokens K={K} vs weight K={K_weight}" + ) + else: + # Original: [out_features, in_features] + N, K_weight = gate_weight.shape + if K != K_weight: + raise ValueError( + f"Dimension mismatch: tokens K={K} vs weight K={K_weight}" + ) + + L = 1 + + # Create output tensors + gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + up_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + + # Add both projections to metadata + for weight, output, output_list in [ + (gate_weight, gate_output, gate_outputs), + (up_weight, up_output, up_outputs), + ]: + self._add_projection_to_metadata( + expert_tokens, + weight, + output, + problem_sizes, + strides_abc, + ptrs_abc, + self.use_pretransposed_weights, + ) + output_list.append(output) + + return problem_sizes, strides_abc, ptrs_abc, gate_outputs, up_outputs + + def _execute_down_projection_gpu( + self, hidden_states, down_weights, m_sizes_gpu, device + ): + """Execute down projection.""" + if not hidden_states: + return [] + + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + + # Prepare metadata + problem_sizes, strides_abc, ptrs_abc, down_outputs = ( + self._prepare_down_metadata_gpu( + hidden_states, down_weights, valid_indices, device + ) + ) + + if len(problem_sizes) == 0: + return [] + + # Execute grouped GEMM + self._execute_grouped_gemm(problem_sizes, strides_abc, ptrs_abc, device) + + return down_outputs + + def _prepare_down_metadata_gpu( + self, hidden_states, down_weights, valid_indices, device + ): + """Prepare metadata for down projection.""" + problem_sizes = [] + strides_abc = [] + ptrs_abc = [] + down_outputs = [] + + valid_indices_cpu = valid_indices.cpu().tolist() + + for i, expert_idx in enumerate(valid_indices_cpu): + if i < len(hidden_states): + hidden = hidden_states[i] + down_weight = down_weights[expert_idx].contiguous() + + M, K = hidden.shape + + if self.use_pretransposed_weights: + # Pre-transposed: [intermediate_size, hidden_size] + K_weight, N = down_weight.shape + if K != K_weight: + raise ValueError( + f"Dimension mismatch: hidden K={K} vs weight K={K_weight}" + ) + else: + # Original: [hidden_size, intermediate_size] + N, 
K_weight = down_weight.shape + if K != K_weight: + raise ValueError( + f"Dimension mismatch: hidden K={K} vs weight K={K_weight}" + ) + + # Create output tensor + down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + down_outputs.append(down_output) + + # Add to metadata + self._add_projection_to_metadata( + hidden, + down_weight, + down_output, + problem_sizes, + strides_abc, + ptrs_abc, + self.use_pretransposed_weights, + ) + + return problem_sizes, strides_abc, ptrs_abc, down_outputs + + def _add_projection_to_metadata( + self, + input_tensor, + weight_tensor, + output_tensor, + problem_sizes, + strides_abc, + ptrs_abc, + is_pretransposed, + ): + """Add projection to metadata with correct handling for weight layout.""" + M, K = input_tensor.shape + + if is_pretransposed: + # Weight is [K, N] + K_weight, N = weight_tensor.shape + else: + # Weight is [N, K] - need to handle transpose via strides + N, K_weight = weight_tensor.shape + + if K != K_weight: + raise ValueError(f"K dimension mismatch: {K} vs {K_weight}") + + L = 1 + + # Convert to MNKL format + input_mnkl = input_tensor.unsqueeze(-1).contiguous() + weight_mnkl = weight_tensor.unsqueeze(-1).contiguous() + output_mnkl = output_tensor.unsqueeze(-1).contiguous() + + # Extract strides + input_strides = list(input_mnkl.stride()[:2]) + weight_strides = list(weight_mnkl.stride()[:2]) + output_strides = list(output_mnkl.stride()[:2]) + + # For non-pretransposed weights, we need to swap strides to simulate transpose + if not is_pretransposed: + weight_strides = [weight_strides[1], weight_strides[0]] + + # Add to metadata + problem_sizes.append([M, N, K, L]) + strides_abc.append([input_strides, weight_strides, output_strides]) + ptrs_abc.append( + [ + input_tensor.data_ptr(), + weight_tensor.data_ptr(), + output_tensor.data_ptr(), + ] + ) + + def _execute_grouped_gemm(self, problem_sizes, strides_abc, ptrs_abc, device): + """Execute the grouped GEMM kernel.""" + num_groups = len(problem_sizes) + + # Convert to CUTE tensors + problem_sizes_cute, strides_cute, ptrs_cute = self._convert_to_cute_tensors( + problem_sizes, strides_abc, ptrs_abc, device + ) + + # Get tensormap and compute clusters + tensormap_cute = self._get_tensormap_buffer(device) + total_clusters = self._compute_total_clusters(problem_sizes) + + # Get initial tensors for compilation + initial_tensors = self._create_initial_tensors( + problem_sizes[0], device, self.use_pretransposed_weights + ) + + # Compile or retrieve kernel + compiled_kernel = self._get_compiled_kernel( + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ) + + # Execute kernel + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + + def _convert_to_cute_tensors(self, problem_sizes, strides_abc, ptrs_abc, device): + """Convert metadata to CUTE tensors using converter.""" + return self.converter.create_metadata_tensors( + problem_sizes, strides_abc, ptrs_abc, device + ) + + def _get_compiled_kernel( + self, + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ): + """Get or compile the grouped GEMM kernel.""" + cache_key = ( + num_groups, + total_clusters, + self.use_2cta_instrs, + self.mma_tiler_mn, + self.cluster_shape_mn, + self.use_pretransposed_weights, + ) + + if cache_key not in self._compiled_kernels: + print( + f"Compiling CUTLASS kernel: {num_groups} groups, " + 
f"2CTA={self.use_2cta_instrs}, pretransposed={self.use_pretransposed_weights}" + ) + + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + *initial_tensors, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + self.stream, + ) + print("✅ Kernel compilation successful") + + return self._compiled_kernels[cache_key] + + def _create_initial_tensors(self, problem_shape, device, is_pretransposed): + """Create initial CUTE tensors for kernel compilation.""" + M, N, K, L = problem_shape + + # Create tensors with correct shapes for compilation + if is_pretransposed: + # For pre-transposed weights, B matrix has shape [K, N] + tensors = [ + torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A + torch.randn( + K, N, dtype=self.DTYPE_TORCH, device=device + ), # B (pre-transposed) + torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C + ] + else: + # For original layout, B matrix has shape [N, K] + tensors = [ + torch.randn(M, K, dtype=self.DTYPE_TORCH, device=device), # A + torch.randn( + N, K, dtype=self.DTYPE_TORCH, device=device + ), # B (original) + torch.zeros(M, N, dtype=self.DTYPE_TORCH, device=device), # C + ] + + # Convert to MNKL format and create CUTE tensors + cute_tensors = [] + for tensor in tensors: + mnkl_tensor = tensor.unsqueeze(-1).contiguous() + cute_tensor = from_dlpack(mnkl_tensor, assumed_align=self.ALIGNMENT) + cute_tensor.element_type = self.DTYPE_CUTLASS + cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) + cute_tensors.append(cute_tensor) + + return cute_tensors + + def _get_tensormap_buffer(self, device): + """Get or create tensormap buffer.""" + if device not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + self._tensormap_buffers[device] = self.converter.create_tensormap_buffer( + device, sm_count, self.TENSORMAP_COUNT, self.TENSORMAP_BYTES + ) + + return self._tensormap_buffers[device] + + def _compute_total_clusters(self, problem_sizes): + """Compute total number of clusters needed.""" + cluster_tile_m = self.mma_tiler_mn[0] + cluster_tile_n = self.mma_tiler_mn[1] + + if self.use_2cta_instrs: + cluster_tile_m //= 2 + + cluster_tile_m *= self.cluster_shape_mn[0] + cluster_tile_n *= self.cluster_shape_mn[1] + + total = 0 + for M, N, K, L in problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + + return total + + def _compute_valid_offsets(self, m_sizes_gpu, m_offsets_gpu, valid_indices, device): + """Compute valid offsets for expert operations.""" + valid_sizes = m_sizes_gpu[valid_indices] + + if len(m_offsets_gpu) > len(valid_indices): + return m_offsets_gpu[valid_indices] + else: + return torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 + ) + + def _apply_activation_and_combine(self, gate_outputs, up_outputs): + """Apply activation and combine gate/up outputs.""" + return [ + self.activation_function(gate_out) * up_out + for gate_out, up_out in zip(gate_outputs, up_outputs) + ] + + def _reconstruct_output_gpu( + self, final_outputs, m_sizes_gpu, m_offsets_gpu, output + ): + """Reconstruct the full output tensor.""" + if not final_outputs: + return output + + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + valid_sizes = m_sizes_gpu[valid_indices] + + # Compute offsets + valid_offsets = 
self._compute_valid_offsets( + m_sizes_gpu, m_offsets_gpu, valid_indices, m_sizes_gpu.device + ) + + # Convert to CPU for final reconstruction + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + + for i, (size, offset) in enumerate(zip(valid_sizes_cpu, valid_offsets_cpu)): + if i < len(final_outputs): + output[offset : offset + size] = final_outputs[i] + + return output + + @staticmethod + def is_available() -> bool: + """Check if CUTLASS is available.""" + return HAS_CUTLASS + + +# ======= one more version with pre-transposed weights (no runtime transpose) ======= + + class CUTLASSGroupedGemmStrategy_pre_transpose(GroupGEMMStrategy): """ Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture. @@ -1412,6 +2084,7 @@ def _log_initialization(self): print(f" - MMA tiler: {self.mma_tiler_mn}") print(f" - Cluster shape: {self.cluster_shape_mn}") print(f" - Max active clusters: {self.max_active_clusters}") + assert False, "we should not be here..." def arrange_expert_weights( self, all_weights: List[torch.Tensor], submod_name: str, module diff --git a/torchtitan/experiments/deepseek_v3/inference.sh b/torchtitan/experiments/deepseek_v3/inference.sh index afbab8f20..ae1307eed 100644 --- a/torchtitan/experiments/deepseek_v3/inference.sh +++ b/torchtitan/experiments/deepseek_v3/inference.sh @@ -6,6 +6,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +export CUTE_DSL_DISABLE_FILE_CACHING=True + NGPU=${NGPU:-"4"} # Get the prompt from command line argument or use a default From e0a064752d9f8ae645ee907b3932f76ab2f9be02 Mon Sep 17 00:00:00 2001 From: lessw2020 Date: Sun, 22 Jun 2025 11:37:59 -0700 Subject: [PATCH 34/34] improved converter class --- .../deepseek_v3/cutlass_grouped_gemm.py | 879 +++++++++++++++++- 1 file changed, 878 insertions(+), 1 deletion(-) diff --git a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py index ad02229d4..b4effcb5c 100644 --- a/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py +++ b/torchtitan/experiments/deepseek_v3/cutlass_grouped_gemm.py @@ -44,10 +44,887 @@ logger = logging.getLogger(__name__) +import logging +from typing import Any, Dict, List, Optional, Tuple + +import torch + +from .group_gemms import GroupGEMMStrategy + +try: + import cuda.bindings.driver as cuda + import cutlass + import cutlass.cute as cute + import cutlass.torch as cutlass_torch + import cutlass.utils as utils + from cutlass.cute.runtime import from_dlpack + from torchtitan.experiments.kernels.blackwell.cute_grouped_gemm import ( + GroupedGemmKernel, + ) + + HAS_CUTLASS = True + print("✓ CUTLASS and strategies imported successfully") +except ImportError as e: + HAS_CUTLASS = False + print(f"✗ CUTLASS import failed: {e}") + print("CUTLASSGroupedGemmStrategy will not be available") + + +logger = logging.getLogger(__name__) + + +class PyTorchToCuteConverter: + """ + Standalone converter for PyTorch tensors to CUTE tensors. + + """ + + # Data type mappings + DTYPE_MAP = { + torch.float16: cutlass.Float16, + torch.bfloat16: cutlass.BFloat16, + torch.float32: cutlass.Float32, + torch.int8: cutlass.Int8, + torch.int32: cutlass.Int32, + torch.int64: cutlass.Int64, + } + + def __init__(self, alignment: int = 16, acc_dtype=cutlass.Float32): + """ + Initialize the converter. 
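+
+        Usage sketch (illustrative; assumes a CUDA device and bfloat16 data):
+
+        .. code-block:: python
+
+            converter = PyTorchToCuteConverter(alignment=16)
+            x = torch.randn(128, 64, dtype=torch.bfloat16, device="cuda")
+            x_cute = converter.convert_tensor_to_cute(x)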
+ + Args: + alignment: Memory alignment requirement for CUTE tensors + acc_dtype: Accumulation data type for CUTLASS operations + """ + self.alignment = alignment + self.acc_dtype = acc_dtype + + def get_cutlass_dtype(self, torch_dtype: torch.dtype): + """Convert PyTorch dtype to CUTLASS dtype with validation.""" + if torch_dtype not in self.DTYPE_MAP: + raise ValueError(f"Unsupported PyTorch dtype: {torch_dtype}") + return self.DTYPE_MAP[torch_dtype] + + def convert_tensor_to_cute( + self, + tensor: torch.Tensor, + make_dynamic: bool = True, + dynamic_leading_dim: int = 1, + ) -> "cute.Tensor": + """ + Convert PyTorch tensor to CUTE tensor with validation. + + Args: + tensor: Input PyTorch tensor + make_dynamic: Whether to mark layout as dynamic + dynamic_leading_dim: Which dimension to make dynamic + + Returns: + CUTE tensor ready for CUTLASS operations + """ + if not tensor.is_contiguous(): + tensor = tensor.contiguous() + + # Convert to MNKL format if needed + if len(tensor.shape) == 2: + mnkl_tensor = tensor.unsqueeze(-1).contiguous() + else: + mnkl_tensor = tensor + + # Create CUTE tensor + cute_tensor = from_dlpack(mnkl_tensor, assumed_align=self.alignment) + cute_tensor.element_type = self.get_cutlass_dtype(tensor.dtype) + + if make_dynamic: + cute_tensor = cute_tensor.mark_layout_dynamic( + leading_dim=dynamic_leading_dim + ) + + return cute_tensor + + def create_metadata_tensors( + self, + problem_sizes: List[List[int]], + strides_abc: List[List[List[int]]], + ptrs_abc: List[List[int]], + device: torch.device, + ) -> Tuple: + """ + Create CUTE tensors for grouped GEMM metadata with validation. + + Args: + problem_sizes: List of [M, N, K, L] for each problem + strides_abc: List of stride information for A, B, C tensors + ptrs_abc: List of data pointers for A, B, C tensors + device: Target device + + Returns: + Tuple of (problem_sizes_cute, strides_cute, ptrs_cute) + """ + if not problem_sizes: + raise ValueError("problem_sizes cannot be empty") + + if not (len(problem_sizes) == len(strides_abc) == len(ptrs_abc)): + raise ValueError("All metadata lists must have the same length") + + # Convert to PyTorch tensors with validation + try: + problem_sizes_tensor = torch.tensor( + problem_sizes, dtype=torch.int32, device=device + ) + strides_tensor = torch.tensor(strides_abc, dtype=torch.int32, device=device) + ptrs_tensor = torch.tensor(ptrs_abc, dtype=torch.int64, device=device) + except Exception as e: + raise ValueError(f"Failed to create metadata tensors: {e}") + + # Convert to CUTE tensors + return ( + from_dlpack(problem_sizes_tensor, assumed_align=self.alignment), + from_dlpack(strides_tensor, assumed_align=self.alignment), + from_dlpack(ptrs_tensor, assumed_align=self.alignment), + ) + + def create_initial_tensors( + self, + problem_shape: Tuple[int, int, int, int], + device: torch.device, + dtype: torch.dtype = torch.bfloat16, + ) -> List: + """ + Create initial CUTE tensors for kernel compilation with validation. 
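+
+        Example (illustrative; a representative (M, N, K, L) shape is enough to
+        seed compilation, since the resulting layouts are marked dynamic):
+
+        .. code-block:: python
+
+            a_cute, b_cute, c_cute = converter.create_initial_tensors(
+                (128, 256, 64, 1), torch.device("cuda"), torch.bfloat16
+            )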
+ + Args: + problem_shape: (M, N, K, L) shape tuple + device: Target device + dtype: PyTorch data type + + Returns: + List of CUTE tensors for kernel compilation + """ + M, N, K, L = problem_shape + + if any(dim <= 0 for dim in [M, N, K, L]): + raise ValueError(f"Invalid problem shape: {problem_shape}") + + # Create PyTorch tensors + tensors = [ + torch.randn(M, K, dtype=dtype, device=device), # A + torch.randn(N, K, dtype=dtype, device=device), # B + torch.zeros(M, N, dtype=dtype, device=device), # C + ] + + # Convert to CUTE tensors + cute_tensors = [] + for tensor in tensors: + cute_tensor = self.convert_tensor_to_cute(tensor) + cute_tensors.append(cute_tensor) + + return cute_tensors + + def create_tensormap_buffer( + self, + device: torch.device, + sm_count: int, + tensormap_count: int = 3, + tensormap_bytes: int = 128, + ): + """ + Create tensormap buffer for CUTLASS kernel with validation. + + Args: + device: Target device + sm_count: Number of streaming multiprocessors + tensormap_count: Number of tensormap entries + tensormap_bytes: Bytes per tensormap entry + + Returns: + CUTE tensor for tensormap buffer + """ + if sm_count <= 0: + raise ValueError(f"Invalid sm_count: {sm_count}") + + if tensormap_bytes % 8 != 0: + raise ValueError( + f"tensormap_bytes must be divisible by 8: {tensormap_bytes}" + ) + + tensormap_tensor = torch.zeros( + (sm_count, tensormap_count, tensormap_bytes // 8), + dtype=torch.int64, + device=device, + ) + + return from_dlpack(tensormap_tensor, assumed_align=self.alignment) + + +class ExpertOperationMetadata: + """Helper class to manage metadata for individual expert operations.""" + + def __init__( + self, + input_tensor: torch.Tensor, + weight_tensor: torch.Tensor, + output_tensor: torch.Tensor, + ): + self.input_tensor = input_tensor.contiguous() + self.weight_tensor = weight_tensor.contiguous() + self.output_tensor = output_tensor.contiguous() + + # Validate dimensions + self._validate_dimensions() + + # Extract shapes + self.M, self.K = self.input_tensor.shape + self.N = self.weight_tensor.shape[0] # Assuming [out_features, in_features] + self.L = 1 + + def _validate_dimensions(self): + """Validate tensor dimensions for matrix multiplication.""" + if len(self.input_tensor.shape) != 2: + raise ValueError( + f"Input tensor must be 2D, got shape: {self.input_tensor.shape}" + ) + + if len(self.weight_tensor.shape) != 2: + raise ValueError( + f"Weight tensor must be 2D, got shape: {self.weight_tensor.shape}" + ) + + if len(self.output_tensor.shape) != 2: + raise ValueError( + f"Output tensor must be 2D, got shape: {self.output_tensor.shape}" + ) + + input_k = self.input_tensor.shape[1] + weight_k = self.weight_tensor.shape[1] + + if input_k != weight_k: + raise ValueError( + f"Matrix multiplication dimension mismatch: " + f"input K={input_k} vs weight K={weight_k}" + ) + + def get_problem_size(self) -> List[int]: + """Get problem size in MNKL format.""" + return [self.M, self.N, self.K, self.L] + + def get_strides(self) -> List[List[int]]: + """Get stride information for all tensors.""" + # Convert to MNKL format for stride extraction + input_mnkl = self.input_tensor.unsqueeze(-1) + weight_mnkl = self.weight_tensor.unsqueeze(-1) + output_mnkl = self.output_tensor.unsqueeze(-1) + + return [ + list(input_mnkl.stride()[:2]), + list(weight_mnkl.stride()[:2]), + list(output_mnkl.stride()[:2]), + ] + + def get_pointers(self) -> List[int]: + """Get data pointers for all tensors.""" + return [ + self.input_tensor.data_ptr(), + self.weight_tensor.data_ptr(), + 
self.output_tensor.data_ptr(), + ] + + +class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): + """ + Improved CUTLASS GroupedGemmKernel strategy with better tensor conversion. + + This version eliminates CPU-GPU synchronization and provides cleaner tensor + management through dedicated converter classes. + """ + + # Configuration constants + SUPPORTED_CLUSTER_SHAPES = [ + (1, 1), + (1, 2), + (1, 4), + (2, 1), + (2, 2), + (2, 4), + (4, 1), + (4, 2), + (4, 4), + ] + + SINGLE_CTA_M_SIZES = [128, 64] + DUAL_CTA_M_SIZES = [256, 128] + N_SIZE_RANGE = range(32, 257, 32) + + DTYPE_TORCH = torch.bfloat16 + DTYPE_CUTLASS = cutlass.BFloat16 + ACC_DTYPE = cutlass.Float32 + ALIGNMENT = 16 + TENSORMAP_COUNT = 3 + TENSORMAP_BYTES = 128 + + def __init__( + self, + custom_activation, + use_2cta_instrs: bool = True, + mma_tiler_mn: Optional[Tuple[int, int]] = None, + cluster_shape_mn: Optional[Tuple[int, int]] = None, + ): + """Initialize the improved CUTLASS grouped GEMM strategy.""" + super().__init__(custom_activation) + + if not HAS_CUTLASS: + raise RuntimeError("CUTLASS not available") + + # Set configuration + self.use_2cta_instrs = use_2cta_instrs + self.mma_tiler_mn = mma_tiler_mn or self._get_default_mma_tiler() + self.cluster_shape_mn = cluster_shape_mn or self._get_default_cluster_shape() + + # Initialize converter + self.converter = ImprovedPyTorchToCuteConverter( + alignment=self.ALIGNMENT, acc_dtype=self.ACC_DTYPE + ) + + # Initialize kernel and hardware + self._initialize_components() + + # Initialize caches + self._compiled_kernels = {} + self._tensormap_buffers = {} + + self._log_initialization() + + def _get_default_mma_tiler(self) -> Tuple[int, int]: + """Get default MMA tiler configuration based on CTA mode.""" + return (256, 128) if self.use_2cta_instrs else (128, 128) + + def _get_default_cluster_shape(self) -> Tuple[int, int]: + """Get default cluster shape based on CTA mode.""" + return (2, 2) if self.use_2cta_instrs else (1, 1) + + def _initialize_components(self): + """Initialize CUTLASS kernel and hardware components.""" + # Initialize kernel + self.grouped_gemm = GroupedGemmKernel( + acc_dtype=self.ACC_DTYPE, + use_2cta_instrs=self.use_2cta_instrs, + mma_tiler_mn=self.mma_tiler_mn, + cluster_shape_mn=self.cluster_shape_mn, + tensormap_update_mode=utils.TensorMapUpdateMode.SMEM, + ) + + # Initialize hardware info + self.hardware_info = utils.HardwareInfo() + self.max_active_clusters = self.hardware_info.get_max_active_clusters( + self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + ) + + # Initialize CUDA stream + torch_stream = torch.cuda.current_stream() + self.stream = cuda.CUstream(torch_stream.cuda_stream) + + def _log_initialization(self): + """Log initialization information.""" + cluster_size = self.cluster_shape_mn[0] * self.cluster_shape_mn[1] + print(f"✅ CUTLASS Blackwell Group Gemm Strategy initialized:") + print(f" - 2 CTA instructions: {self.use_2cta_instrs}") + print(f" - MMA tiler (M, N): {self.mma_tiler_mn}") + print(f" - Cluster shape (M, N): {self.cluster_shape_mn}") + print(f" - Cluster size: {cluster_size}") + print(f" - Max active clusters: {self.max_active_clusters}") + + def arrange_expert_weights( + self, all_weights: List[torch.Tensor], submod_name: str, module + ) -> torch.Tensor: + """Store weights in stacked format.""" + return torch.stack(all_weights) + + def execute( + self, + contig_tokens: torch.Tensor, + m_sizes: torch.Tensor, + m_offsets: torch.Tensor, + module, + ) -> torch.Tensor: + """ + Execute using improved CUTLASS grouped GEMM with better 
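A minimal sketch of constructing the strategy with its default configuration, as selected by `_get_default_mma_tiler` and `_get_default_cluster_shape`. The SiLU activation and the surrounding MoE wiring are assumptions for illustration only; the constructor signature follows the class defined in this patch and requires CUTLASS and a Blackwell GPU to actually run.

.. code-block:: python

    import torch.nn.functional as F

    strategy = CUTLASSGroupedGemmStrategy(
        custom_activation=F.silu,   # hypothetical activation choice
        use_2cta_instrs=True,       # -> default MMA tiler (256, 128), cluster (2, 2)
    )
    # With use_2cta_instrs=False the defaults fall back to a (128, 128) MMA tiler
    # and a (1, 1) cluster shape.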
tensor management. + + Args: + contig_tokens: Input tokens arranged contiguously by expert + m_sizes: Tensor of expert sizes (GPU tensor to avoid sync) + m_offsets: Tensor of expert offsets (GPU tensor to avoid sync) + module: MoE module containing weights + """ + try: + # Ensure GPU tensors and validate inputs + m_sizes_gpu, m_offsets_gpu = self._prepare_gpu_tensors( + m_sizes, m_offsets, contig_tokens.device + ) + self._validate_inputs(contig_tokens, m_sizes_gpu, module) + + # Get weights and device + weights = self._get_weights(module) + device = contig_tokens.device + + # Prepare output tensor + output = torch.zeros( + contig_tokens.shape[0], + weights["gate"].shape[2], + dtype=self.DTYPE_TORCH, + device=device, + ) + + # Early exit if no valid experts + if not self._has_valid_experts_gpu(m_sizes_gpu): + return output + + # Execute three-stage MoE computation + gate_outputs, up_outputs = self._execute_gate_up_projections( + contig_tokens, + weights["gate"], + weights["up"], + m_sizes_gpu, + m_offsets_gpu, + device, + ) + + hidden_states = self._apply_activation_and_combine(gate_outputs, up_outputs) + + final_outputs = self._execute_down_projection( + hidden_states, weights["down"], m_sizes_gpu, device + ) + + return self._reconstruct_output_gpu( + final_outputs, m_sizes_gpu, m_offsets_gpu, output + ) + + except Exception as e: + logger.error(f"Error in CUTLASS execution: {e}") + raise + + def _prepare_gpu_tensors( + self, m_sizes, m_offsets, device + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Ensure sizes and offsets are GPU tensors with validation.""" + if not isinstance(m_sizes, torch.Tensor): + m_sizes_gpu = torch.tensor(m_sizes, dtype=torch.int32, device=device) + else: + m_sizes_gpu = m_sizes.to(device=device, dtype=torch.int32) + + if not isinstance(m_offsets, torch.Tensor): + m_offsets_gpu = torch.tensor(m_offsets, dtype=torch.int32, device=device) + else: + m_offsets_gpu = m_offsets.to(device=device, dtype=torch.int32) + + return m_sizes_gpu, m_offsets_gpu + + def _validate_inputs( + self, contig_tokens: torch.Tensor, m_sizes_gpu: torch.Tensor, module + ): + """Validate input parameters with comprehensive checks.""" + if contig_tokens.dtype != self.DTYPE_TORCH: + raise ValueError( + f"Expected input dtype {self.DTYPE_TORCH}, got {contig_tokens.dtype}" + ) + + if len(contig_tokens.shape) != 2: + raise ValueError( + f"Expected 2D input tensor, got shape {contig_tokens.shape}" + ) + + required_params = ["gate_proj_weight", "up_proj_weight", "down_proj_weight"] + for param in required_params: + if not hasattr(module, param) or module.get_parameter(param) is None: + raise ValueError(f"Module missing required parameter: {param}") + + def _has_valid_experts_gpu(self, m_sizes_gpu: torch.Tensor) -> bool: + """Check if any experts have tokens using GPU operations.""" + return torch.any(m_sizes_gpu > 0).item() + + def _get_weights(self, module) -> Dict[str, torch.Tensor]: + """Extract and return weight tensors from module.""" + return { + "gate": module.get_parameter("gate_proj_weight"), + "up": module.get_parameter("up_proj_weight"), + "down": module.get_parameter("down_proj_weight"), + } + + def _execute_gate_up_projections( + self, + input_tokens: torch.Tensor, + gate_weights: torch.Tensor, + up_weights: torch.Tensor, + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + device: torch.device, + ) -> Tuple[List, List]: + """Execute gate and up projections using improved tensor management.""" + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = 
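For reference, the three-stage computation that `execute` routes through the grouped GEMM kernel is, per expert, the standard gated MLP below. This pure-PyTorch sketch is useful as a correctness check against the CUTLASS path; weights follow the `[out_features, in_features]` layout assumed by `ExpertOperationMetadata`, and SiLU stands in for the strategy's custom activation.

.. code-block:: python

    import torch
    import torch.nn.functional as F

    def reference_expert(x, gate_w, up_w, down_w):
        gate_out = x @ gate_w.t()            # stage 1a: gate projection
        up_out = x @ up_w.t()                # stage 1b: up projection
        hidden = F.silu(gate_out) * up_out   # stage 2: activation and combine
        return hidden @ down_w.t()           # stage 3: down projection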
torch.nonzero(valid_mask, as_tuple=True)[0] + + if len(valid_indices) == 0: + return [], [] + + # Prepare metadata using improved helper + operations_metadata = self._prepare_projection_metadata( + input_tokens, + gate_weights, + up_weights, + m_sizes_gpu, + m_offsets_gpu, + valid_indices, + device, + ) + + if not operations_metadata: + return [], [] + + # Execute grouped GEMM + self._execute_grouped_gemm_with_metadata(operations_metadata, device) + + # Extract outputs + gate_outputs = [op["gate_output"] for op in operations_metadata] + up_outputs = [op["up_output"] for op in operations_metadata] + + return gate_outputs, up_outputs + + def _prepare_projection_metadata( + self, + input_tokens: torch.Tensor, + gate_weights: torch.Tensor, + up_weights: torch.Tensor, + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + valid_indices: torch.Tensor, + device: torch.device, + ) -> List[Dict]: + """Prepare metadata for projections using improved helpers.""" + operations_metadata = [] + + # Extract valid information + valid_sizes = m_sizes_gpu[valid_indices] + valid_offsets = self._compute_valid_offsets( + m_sizes_gpu, m_offsets_gpu, valid_indices, device + ) + + # Convert to CPU for iteration (minimal sync) + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + valid_indices_cpu = valid_indices.cpu().tolist() + + for expert_idx, size, offset in zip( + valid_indices_cpu, valid_sizes_cpu, valid_offsets_cpu + ): + if size > 0: + # Get expert data + expert_tokens = input_tokens[offset : offset + size].contiguous() + gate_weight = gate_weights[expert_idx].contiguous() + up_weight = up_weights[expert_idx].contiguous() + + M, K = expert_tokens.shape + N = gate_weight.shape[0] + + # Create output tensors + gate_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + up_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + + # Create metadata for both projections using helper class + gate_metadata = ExpertOperationMetadata( + expert_tokens, gate_weight, gate_output + ) + up_metadata = ExpertOperationMetadata( + expert_tokens, up_weight, up_output + ) + + operations_metadata.append( + { + "gate_metadata": gate_metadata, + "up_metadata": up_metadata, + "gate_output": gate_output, + "up_output": up_output, + } + ) + + return operations_metadata + + def _execute_down_projection( + self, + hidden_states: List[torch.Tensor], + down_weights: torch.Tensor, + m_sizes_gpu: torch.Tensor, + device: torch.device, + ) -> List[torch.Tensor]: + """Execute down projection using improved tensor management.""" + if not hidden_states: + return [] + + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + valid_indices_cpu = valid_indices.cpu().tolist() + + # Prepare down projection metadata + down_operations = [] + for i, expert_idx in enumerate(valid_indices_cpu): + if i < len(hidden_states): + hidden = hidden_states[i] + down_weight = down_weights[expert_idx].contiguous() + + M, K = hidden.shape + N = down_weight.shape[0] + down_output = torch.empty(M, N, dtype=self.DTYPE_TORCH, device=device) + + # Create metadata using helper class + down_metadata = ExpertOperationMetadata( + hidden, down_weight, down_output + ) + down_operations.append( + { + "metadata": down_metadata, + "output": down_output, + } + ) + + if not down_operations: + return [] + + # Execute grouped GEMM for down projection + self._execute_grouped_gemm_for_down(down_operations, device) + + return [op["output"] for op in 
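The per-expert slicing above relies on offsets that, when not supplied, are derived as an exclusive cumulative sum over the valid expert sizes (the fallback branch of `_compute_valid_offsets`). A self-contained illustration with small example values:

.. code-block:: python

    import torch

    tokens = torch.randn(10, 4)              # 10 tokens routed to 3 experts
    m_sizes = torch.tensor([4, 0, 6])        # expert 1 received no tokens

    valid_idx = torch.nonzero(m_sizes > 0, as_tuple=True)[0]   # tensor([0, 2])
    valid_sizes = m_sizes[valid_idx]                            # tensor([4, 6])
    valid_offsets = torch.cumsum(
        torch.cat([torch.tensor([0]), valid_sizes[:-1]]), dim=0
    )                                                           # tensor([0, 4])

    for i, size, off in zip(
        valid_idx.tolist(), valid_sizes.tolist(), valid_offsets.tolist()
    ):
        expert_tokens = tokens[off : off + size]  # contiguous rows for expert i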
down_operations] + + def _execute_grouped_gemm_with_metadata( + self, operations_metadata: List[Dict], device: torch.device + ): + """Execute grouped GEMM using operations metadata.""" + # Collect all metadata for both gate and up projections + all_problem_sizes = [] + all_strides = [] + all_ptrs = [] + + for op in operations_metadata: + # Add gate projection + gate_meta = op["gate_metadata"] + all_problem_sizes.append(gate_meta.get_problem_size()) + all_strides.append(gate_meta.get_strides()) + all_ptrs.append(gate_meta.get_pointers()) + + # Add up projection + up_meta = op["up_metadata"] + all_problem_sizes.append(up_meta.get_problem_size()) + all_strides.append(up_meta.get_strides()) + all_ptrs.append(up_meta.get_pointers()) + + if not all_problem_sizes: + return + + # Execute using improved converter + self._execute_cutlass_kernel(all_problem_sizes, all_strides, all_ptrs, device) + + def _execute_grouped_gemm_for_down( + self, down_operations: List[Dict], device: torch.device + ): + """Execute grouped GEMM for down projection.""" + all_problem_sizes = [] + all_strides = [] + all_ptrs = [] + + for op in down_operations: + metadata = op["metadata"] + all_problem_sizes.append(metadata.get_problem_size()) + all_strides.append(metadata.get_strides()) + all_ptrs.append(metadata.get_pointers()) + + if not all_problem_sizes: + return + + # Execute using improved converter + self._execute_cutlass_kernel(all_problem_sizes, all_strides, all_ptrs, device) + + def _execute_cutlass_kernel( + self, + problem_sizes: List[List[int]], + strides_abc: List[List[List[int]]], + ptrs_abc: List[List[int]], + device: torch.device, + ): + """Execute CUTLASS kernel using improved converter.""" + num_groups = len(problem_sizes) + + # Convert to CUTE tensors using improved converter + problem_sizes_cute, strides_cute, ptrs_cute = ( + self.converter.create_metadata_tensors( + problem_sizes, strides_abc, ptrs_abc, device + ) + ) + + # Get other required components + tensormap_cute = self._get_tensormap_buffer(device) + total_clusters = self._compute_total_clusters(problem_sizes) + + # Get initial tensors for compilation using improved converter + initial_tensors = self.converter.create_initial_tensors( + tuple(problem_sizes[0]), device, self.DTYPE_TORCH + ) + + # Compile or retrieve kernel + compiled_kernel = self._get_compiled_kernel( + num_groups, + total_clusters, + initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ) + + # Execute kernel + compiled_kernel( + *initial_tensors, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + self.stream, + ) + + def _compute_valid_offsets( + self, + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + valid_indices: torch.Tensor, + device: torch.device, + ) -> torch.Tensor: + """Compute valid offsets for expert operations.""" + valid_sizes = m_sizes_gpu[valid_indices] + + if len(m_offsets_gpu) > len(valid_indices): + return m_offsets_gpu[valid_indices] + else: + return torch.cumsum( + torch.cat([torch.tensor([0], device=device), valid_sizes[:-1]]), dim=0 + ) + + def _get_tensormap_buffer(self, device: torch.device): + """Get or create tensormap buffer using improved converter.""" + if device not in self._tensormap_buffers: + sm_count = self.hardware_info.get_max_active_clusters(1) + self._tensormap_buffers[device] = self.converter.create_tensormap_buffer( + device, sm_count, self.TENSORMAP_COUNT, self.TENSORMAP_BYTES + ) + return self._tensormap_buffers[device] + + def _compute_total_clusters(self, problem_sizes: 
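The gate/up pass above flattens its per-expert metadata into a single launch: E valid experts become 2*E GEMM problems, with the gate and up entries for an expert adjacent so both reuse the same token block as the A operand. A self-contained illustration of that ordering, with example sizes:

.. code-block:: python

    # E valid experts -> 2*E problems, interleaved [gate_0, up_0, gate_1, up_1, ...]
    expert_token_counts = [128, 64, 256]   # M_i per valid expert (example values)
    N, K = 1024, 512                       # intermediate / hidden sizes (example values)

    problem_sizes = []
    for m in expert_token_counts:
        problem_sizes.append([m, N, K, 1])  # gate projection for this expert
        problem_sizes.append([m, N, K, 1])  # up projection for this expert

    assert len(problem_sizes) == 2 * len(expert_token_counts)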
List[List[int]]) -> int: + """Compute total number of clusters needed.""" + cluster_tile_m = self.mma_tiler_mn[0] + cluster_tile_n = self.mma_tiler_mn[1] + + if self.use_2cta_instrs: + cluster_tile_m //= 2 + + cluster_tile_m *= self.cluster_shape_mn[0] + cluster_tile_n *= self.cluster_shape_mn[1] + + total = 0 + for M, N, K, L in problem_sizes: + clusters_m = (M + cluster_tile_m - 1) // cluster_tile_m + clusters_n = (N + cluster_tile_n - 1) // cluster_tile_n + total += clusters_m * clusters_n + + return total + + def _get_compiled_kernel( + self, + num_groups: int, + total_clusters: int, + initial_tensors: List, + problem_sizes_cute, + strides_cute, + ptrs_cute, + tensormap_cute, + ): + """Get or compile the grouped GEMM kernel with caching.""" + cache_key = ( + num_groups, + total_clusters, + self.use_2cta_instrs, + self.mma_tiler_mn, + self.cluster_shape_mn, + ) + + if cache_key not in self._compiled_kernels: + print( + f"Compiling CUTLASS kernel: {num_groups} groups, 2CTA={self.use_2cta_instrs}" + ) + + self._compiled_kernels[cache_key] = cute.compile( + self.grouped_gemm, + *initial_tensors, + num_groups, + problem_sizes_cute, + strides_cute, + ptrs_cute, + total_clusters, + tensormap_cute, + self.max_active_clusters, + self.stream, + ) + print("✅ Kernel compilation successful") + + return self._compiled_kernels[cache_key] + + def _apply_activation_and_combine( + self, gate_outputs: List[torch.Tensor], up_outputs: List[torch.Tensor] + ) -> List[torch.Tensor]: + """Apply activation and combine gate/up outputs.""" + return [ + self.activation_function(gate_out) * up_out + for gate_out, up_out in zip(gate_outputs, up_outputs) + ] + + def _reconstruct_output_gpu( + self, + final_outputs: List[torch.Tensor], + m_sizes_gpu: torch.Tensor, + m_offsets_gpu: torch.Tensor, + output: torch.Tensor, + ) -> torch.Tensor: + """Reconstruct the full output tensor using GPU operations.""" + if not final_outputs: + return output + + # Find valid experts + valid_mask = m_sizes_gpu > 0 + valid_indices = torch.nonzero(valid_mask, as_tuple=True)[0] + valid_sizes = m_sizes_gpu[valid_indices] + + # Compute offsets + valid_offsets = self._compute_valid_offsets( + m_sizes_gpu, m_offsets_gpu, valid_indices, m_sizes_gpu.device + ) + + # Convert to CPU for final reconstruction (minimal sync) + valid_sizes_cpu = valid_sizes.cpu().tolist() + valid_offsets_cpu = valid_offsets.cpu().tolist() + + for i, (size, offset) in enumerate(zip(valid_sizes_cpu, valid_offsets_cpu)): + if i < len(final_outputs): + output[offset : offset + size] = final_outputs[i] + + return output + + @staticmethod + def is_available() -> bool: + """Check if CUTLASS is available.""" + return HAS_CUTLASS + + # ================== current working version ================== -class CUTLASSGroupedGemmStrategy(GroupGEMMStrategy): +class CUTLASSGroupedGemmStrategy_working_backup(GroupGEMMStrategy): """ Strategy using CUTLASS GroupedGemmKernel for group GEMM operations on Blackwell architecture.
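A worked example of the cluster-count arithmetic in `_compute_total_clusters`, using the default 2-CTA configuration (MMA tiler (256, 128), cluster shape (2, 2)); the problem size is an arbitrary example value. This standalone sketch mirrors the method's logic for illustration only.

.. code-block:: python

    def total_clusters(problem_sizes, mma_tiler_mn=(256, 128),
                       cluster_shape_mn=(2, 2), use_2cta_instrs=True):
        tile_m, tile_n = mma_tiler_mn
        if use_2cta_instrs:
            tile_m //= 2                    # each CTA covers half the MMA tile in M
        tile_m *= cluster_shape_mn[0]
        tile_n *= cluster_shape_mn[1]
        total = 0
        for m, n, _k, _l in problem_sizes:
            clusters_m = (m + tile_m - 1) // tile_m
            clusters_n = (n + tile_n - 1) // tile_n
            total += clusters_m * clusters_n
        return total

    # One (8192, 1280, 512, 1) problem: cluster tile is (256, 256), so
    # ceil(8192 / 256) * ceil(1280 / 256) = 32 * 5 = 160 clusters.
    print(total_clusters([[8192, 1280, 512, 1]]))  # 160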