Skip to content

Commit feac956

Browse files
committed
Change wint2 to ColumnMajorTileInterleave.
Change-Id: I593cbe36f991c0c5044989d65f0014087587c624
1 parent 2efcfbb commit feac956

File tree

7 files changed

+72
-51
lines changed

7 files changed

+72
-51
lines changed

custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -133,10 +133,18 @@ template <typename TypeA, typename Arch>
133133
template <typename TypeA, typename Arch>
134134
struct LayoutDetailsB<TypeA, uint2b_t, Arch, typename platform::enable_if<Arch::kMinComputeCapability >= 75>::type>
135135
{
136-
static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits<TypeA>::value;
137-
using Layout = layout::ColumnMajor;
138-
static constexpr int ElementsPerAccess = 8; // at least 4-bytes
139-
using Operator = cutlass::arch::OpMultiplyAdd;
136+
static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits<TypeA>::value; // 64
137+
138+
private:
139+
static constexpr int ElementsPerCacheLine = 128 * 8 / sizeof_bits<uint2b_t>::value;
140+
static constexpr int ColumnsInterleaved = ElementsPerCacheLine / ThreadblockK; // 8
141+
142+
public:
143+
// using Layout = layout::ColumnMajor;
144+
// static constexpr int ElementsPerAccess = 16; // at least 4-bytes
145+
using Layout = layout::ColumnMajorTileInterleave<ThreadblockK, ColumnsInterleaved>;
146+
static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits<uint2b_t>::value; // 64
147+
using Operator = cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA;
140148
};
141149

142150
template <typename TypeA, typename Arch>

custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/default_wint2x_mma.h

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ struct DefaultWint2xMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlig
106106
static_assert(platform::is_same<ElementB, uint2b_t>::value,
107107
"Element B must be uint2b_t");
108108

109+
static_assert(platform::is_same<Operator, arch::OpMultiplyAddDequantizeInterleavedBToA>::value,
110+
"Mma multistage must dequantize after ldsm");
111+
109112
static cutlass::arch::CacheOperation::Kind const CacheOpA = ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
110113
? cutlass::arch::CacheOperation::Global
111114
: cutlass::arch::CacheOperation::Always;
@@ -117,8 +120,8 @@ struct DefaultWint2xMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlig
117120
// Define the MmaCore components
118121
// Mma core does not depend on stages, so pass in at least 3 here so that mma multistage pieces are created
119122
using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<ThreadblockShape, WarpShape, InstructionShape,
120-
ElementA, LayoutA, ElementA, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass, std::max(kStages, 3),
121-
Operator, false, CacheOpA, CacheOpB>;
123+
ElementA, LayoutA, ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
124+
std::max(kStages, 3), Operator, false, CacheOpA, CacheOpB>;
122125

123126
// Define iterators over tiles from the A operand
124127
using ThreadMapA = typename MmaCore::IteratorThreadMapA;
@@ -127,17 +130,39 @@ struct DefaultWint2xMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlig
127130
cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>, ElementA, LayoutA, 1, ThreadMapA,
128131
AccessTypeA>;
129132

130-
// Define iterators over tiles from the B operand
133+
private:
134+
static constexpr int kColumnsInterleaved = LayoutB::kColumnsInterleaved;
135+
static constexpr int kRowsPerTile = LayoutB::kRowsPerTile;
136+
static_assert(!(MmaCore::Shape::kN % kColumnsInterleaved), "ThreadblockShape must be divisible by kColumnsInterleaved");
137+
static_assert(kRowsPerTile == MmaCore::Shape::kK, "");
138+
131139
using ThreadMapB = typename MmaCore::IteratorThreadMapB;
140+
using WarpArrangement = typename ThreadMapB::Detail::WarpThreadArrangement;
141+
static_assert(!(WarpArrangement::kStrided % kColumnsInterleaved), "");
142+
143+
using IteratorShapeB = MatrixShape<
144+
MmaCore::Shape::kK * kColumnsInterleaved, MmaCore::Shape::kN / kColumnsInterleaved>;
145+
using InterleavedThreadMapB = transform::PitchLinearWarpRakedThreadMap<
146+
layout::PitchLinearShape<IteratorShapeB::kRow, IteratorShapeB::kColumn>,
147+
ThreadMapB::kThreads,
148+
layout::PitchLinearShape<WarpArrangement::kContiguous * kColumnsInterleaved,
149+
WarpArrangement::kStrided / kColumnsInterleaved>,
150+
MmaCore::kAccessSizeInBits / sizeof_bits<ElementB>::value>;
151+
152+
public:
153+
// Define iterators over tiles from the B operand
132154
using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
133155
using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator<
134-
cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, ElementB, LayoutB, 0, ThreadMapB,
156+
IteratorShapeB, ElementB, layout::ColumnMajor, 0, InterleavedThreadMapB,
135157
AccessTypeB>;
136158

137159
// Define the threadblock-scoped multistage matrix multiply
138-
using ThreadblockMma = cutlass::gemm::threadblock::Wint2xMmaMultistage<typename MmaCore::Shape, IteratorA,
139-
typename MmaCore::SmemIteratorA, MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
140-
MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor, typename MmaCore::MmaPolicy, kStages, SharedMemoryClear>;
160+
using ThreadblockMma = cutlass::gemm::threadblock::Wint2xMmaMultistage<
161+
typename MmaCore::Shape,
162+
IteratorA, typename MmaCore::SmemIteratorA, MmaCore::kCacheOpA,
163+
IteratorB, typename MmaCore::SmemIteratorB, MmaCore::kCacheOpB,
164+
ElementAccumulator, layout::RowMajor,
165+
typename MmaCore::MmaPolicy, kStages, SharedMemoryClear>;
141166
};
142167

143168
} // namespace threadblock

custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h

Lines changed: 9 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -266,10 +266,9 @@ class Wint2xMmaMultistage :
266266
if (smem_read_stage_idx_ == Base::kStages) {
267267
// Wrap back around to the 'start' of the circular buffer in shared memory
268268
this->warp_tile_iterator_A_.add_tile_offset({0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
269-
// this->warp_tile_iterator_B_.add_tile_offset({-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0});
269+
this->warp_tile_iterator_B_.add_tile_offset({-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0});
270270
smem_read_stage_idx_ = 0;
271271
}
272-
this->warp_tile_iterator_B_.add_tile_offset({-Policy::kPartitionsK * Base::kWarpGemmIterations, 0});
273272
}
274273

275274
/// Advance global memory read-iterators and shared memory write-iterators to the stage
@@ -566,16 +565,6 @@ class Wint2xMmaMultistage :
566565
this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[(warp_mma_k + 1) % 2]);
567566
++this->warp_tile_iterator_A_;
568567

569-
if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
570-
// Unpack and dequant the first stage of B.
571-
int unpack_stage = stage - Base::kStages + 2;
572-
//tile_dequanter_B.UnpackAndDequant(smem_zipped_ptr_B_ + (unpack_stage % Base::kStages) * smem_zipped_bytes_per_stage_B_,
573-
// column_wise_smem_ptr_B_, unpack_stage);
574-
575-
// Copy dequantized data to shared memory used by mma core.
576-
//copy_tiles_and_advance_per_stage_B<false, false>(iterator_B);
577-
}
578-
579568
// Load the next warp-tile's B fragment from shared memory
580569
this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
581570
this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_[(warp_mma_k + 1) % 2]);
@@ -617,13 +606,10 @@ class Wint2xMmaMultistage :
617606
// global->shared fragment copies
618607
if (warp_mma_k < Base::kWarpGemmIterations - 1) {
619608
int group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
609+
int group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
620610

621611
copy_tiles_and_advance_A(iterator_A, group_start_iteration_A);
622-
623-
if (warp_mma_k == 0) {
624-
tile_dequanter_B.Load(smem_zipped_ptr_B_ + (stage % Base::kStages) * smem_zipped_bytes_per_stage_B_,
625-
column_wise_smem_ptr_B_, stage);
626-
}
612+
copy_tiles_and_advance_B<false>(iterator_B, group_start_iteration_B);
627613
}
628614

629615
// The second-to-last warp-tile also:
@@ -632,8 +618,10 @@ class Wint2xMmaMultistage :
632618
if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
633619
// Performs the last warp-tile's share of global->shared fragment copies
634620
int group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
621+
int group_start_iteration_B = (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
635622

636623
copy_tiles_and_advance_A(iterator_A, group_start_iteration_A);
624+
copy_tiles_and_advance_B<false>(iterator_B, group_start_iteration_B);
637625

638626
// Inserts a memory fence between stages of cp.async instructions.
639627
cutlass::arch::cp_async_fence();
@@ -648,7 +636,7 @@ class Wint2xMmaMultistage :
648636
// Disable global fetching when done with global fetch iterations
649637
--gemm_k_iterations;
650638
iterator_A.clear_mask(gemm_k_iterations == 0);
651-
iterator_B.clear_mask(gemm_k_iterations == (-Base::kStages + 1));
639+
iterator_B.clear_mask(gemm_k_iterations == 0);
652640
}
653641

654642
// The last warp-tile also converts the shared memory fragments used by
@@ -675,12 +663,8 @@ class Wint2xMmaMultistage :
675663
IteratorB &iterator_B,
676664
TileDequanterB &tile_dequanter_B) ///< [in|out] iterator over B operand in global memory
677665
{
678-
#if 0
679666
PipeState pipe_state;
680667

681-
// Unpack and dequant the first stage of B.
682-
//tile_dequanter_B.UnpackAndDequant(smem_zipped_ptr_B_, column_wise_smem_ptr_B_, 0);
683-
684668
// Disable global fetching if done with global fetch iterations
685669
iterator_A.clear_mask(gemm_k_iterations == 0);
686670
iterator_B.clear_mask(gemm_k_iterations == (-Base::kStages + 1));
@@ -690,9 +674,6 @@ class Wint2xMmaMultistage :
690674
this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[0]);
691675
++this->warp_tile_iterator_A_;
692676

693-
// Copy dequantized data to shared memory used by mma core.
694-
//copy_tiles_and_advance_per_stage_B<false, true>(iterator_B);
695-
696677
// Load first warp-tile's B fragment from shared memory
697678
this->warp_tile_iterator_B_.set_kgroup_index(0);
698679
this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_[0]);
@@ -709,6 +690,7 @@ class Wint2xMmaMultistage :
709690
pipe_state.tmp_accum_.clear();
710691
}
711692

693+
#if 0
712694
int stage = Base::kStages - 1;
713695

714696
// Mainloop
@@ -723,6 +705,7 @@ class Wint2xMmaMultistage :
723705
gemm_k_iterations,
724706
stage);
725707
stage += 1;
708+
break;
726709
}
727710

728711
if (Detail::kStagedAccumulation) {
@@ -766,8 +749,7 @@ class Wint2xMmaMultistage :
766749
else
767750
{
768751
this->warp_tile_iterator_A_.add_tile_offset({0, ((Base::kStages - 2) * kStageIters)});
769-
//this->warp_tile_iterator_B_.add_tile_offset({((Base::kStages - 2) * kStageIters), 0});
770-
this->warp_tile_iterator_B_.add_tile_offset({(-2 * kStageIters), 0});
752+
this->warp_tile_iterator_B_.add_tile_offset({((Base::kStages - 2) * kStageIters), 0});
771753
}
772754
smem_read_stage_idx_ = smem_write_stage_idx_;
773755
}

custom_ops/gpu_ops/cutlass_extensions/gemm/warp/default_mma_tensor_op.h

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,9 @@
4141
#include "cutlass_extensions/arch/mma.h"
4242
#include "cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h"
4343

44-
namespace cutlass
45-
{
46-
namespace gemm
47-
{
48-
namespace warp
49-
{
44+
namespace cutlass {
45+
namespace gemm {
46+
namespace warp {
5047

5148
/////////////////////////////////////////////////////////////////////////////////////////////////
5249

@@ -81,7 +78,7 @@ struct DefaultMmaTensorOp<WarpShape_, InstructionShape_, ElementA, LayoutA, Elem
8178
// Shape for computing the FP16s
8279
using ComputeInstructionShape = InstructionShape_;
8380

84-
// Chosen so we get K=16 for int8 and K=32 for int4.
81+
// Chosen so we get K=16 for int8, K=32 for int4, K=64 for int2.
8582
static constexpr int LoadInstructionK = 128 / sizeof_bits<ElementB>::value;
8683

8784
// Shape for loading the narrow data type from shared memory

custom_ops/gpu_ops/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,11 @@ class MmaTensorOpComputeBWithF16
295295
assert(0);
296296
#endif
297297
}
298+
299+
/// Transform the mma operands to the required types
300+
CUTLASS_DEVICE
301+
void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
302+
FragmentA const &A, FragmentB const &B) const {}
298303
};
299304

300305
/////////////////////////////////////////////////////////////////////////////////////////////////

custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_template.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -715,8 +715,8 @@ void MoeGemmRunner<T, WeightQuantTraits>::run_gemm<EpilogueTag>(
715715
std::vector<CutlassGemmConfig> candidate_configs =
716716
get_candidate_configs(sm_, -1, is_weight_only, only_simt_configs, true);
717717

718-
static constexpr int warm_time = 5;
719-
static constexpr int test_time = 10;
718+
static constexpr int warm_time = 0;
719+
static constexpr int test_time = 1;
720720
auto& gemmConfigManager = GemmConfigManager::Instance();
721721
constexpr GemmDataType dtype = getGemmDataType<T>();
722722
constexpr GemmDataType wdtype = getGemmDataType<WeightType>();
@@ -735,8 +735,10 @@ void MoeGemmRunner<T, WeightQuantTraits>::run_gemm<EpilogueTag>(
735735
std::min(gemmConfigManager.nextPowerOfTwo(actual_total_rows),
736736
gemmConfigManager.getMaxProfileM());
737737
bool find_one = false;
738-
size_t num_candidate_configs_size = candidate_configs.size();
739-
for (size_t ii = 0; ii < num_candidate_configs_size; ++ii) {
738+
size_t num_candidate_configs_size = 2;//candidate_configs.size();
739+
// for (size_t ii = 0; ii < num_candidate_configs_size; ++ii)
740+
{
741+
size_t ii = 1;
740742
try {
741743
for (int i = 0; i < warm_time; i++) {
742744
dispatch_to_arch<EpilogueTag>(A,
@@ -780,7 +782,7 @@ void MoeGemmRunner<T, WeightQuantTraits>::run_gemm<EpilogueTag>(
780782
check_cuda_error(cudaEventElapsedTime(&elapsed, start, stop));
781783
check_cuda_error(cudaEventDestroy(start));
782784
check_cuda_error(cudaEventDestroy(stop));
783-
//std::cout << "[TUNING] config: " << ii << ", time: " << elapsed << " ms" << std::endl;
785+
std::cout << "[TUNING] config: " << ii << ", time: " << elapsed << " ms" << std::endl;
784786
if (elapsed < best_time) {
785787
best_id = ii;
786788
best_time = elapsed;
@@ -801,6 +803,7 @@ void MoeGemmRunner<T, WeightQuantTraits>::run_gemm<EpilogueTag>(
801803
}
802804
}
803805

806+
#if 0
804807
dispatch_to_arch<EpilogueTag>(A,
805808
B,
806809
weight_scales,
@@ -814,6 +817,7 @@ void MoeGemmRunner<T, WeightQuantTraits>::run_gemm<EpilogueTag>(
814817
quant_args_B,
815818
chosen_config,
816819
stream);
820+
#endif
817821
}
818822

819823
template <typename T, typename WeightQuantTraits>

custom_ops/gpu_ops/moe/moe_ffn_wint2.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
#include "moe/fast_hardamard_kernel.h"
2121
#include "moe/fused_moe_helper.h"
2222

23-
#define _GROUP_GEMM_ONLY 0
23+
#define _GROUP_GEMM_ONLY 1
2424

2525
template <typename DataT, typename NvType, typename WeightSavedT, cutlass::WintQuantMethod QuantMethod>
2626
void WeightOnlyMoeFFNKernel(const paddle::Tensor& permute_input,

0 commit comments

Comments
 (0)