
Commit fbd86c8 (1 parent: d5af789)

Change wint2 to ColumnMajor.

Change-Id: I6b44d02946a685f8fe24d9f2c7be258b51e16da2

9 files changed: 129 additions, 57 deletions.


custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h (2 additions, 2 deletions)

@@ -134,8 +134,8 @@ template <typename TypeA, typename Arch>
 struct LayoutDetailsB<TypeA, uint2b_t, Arch, typename platform::enable_if<Arch::kMinComputeCapability >= 75>::type>
 {
     static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits<TypeA>::value;
-    using Layout = layout::RowMajor;
-    static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits<TypeA>::value;
+    using Layout = layout::ColumnMajor;
+    static constexpr int ElementsPerAccess = 8;  // at least 4-bytes
     using Operator = cutlass::arch::OpMultiplyAdd;
 };
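For context, a compile-time sketch of the arithmetic behind the two constants. This is illustrative only; it assumes TypeA is cutlass::half_t, which (along with bfloat16_t, also 16-bit) is what the DefaultMma specializations below instantiate:

    #include "cutlass/numeric_types.h"

    // half_t is 16 bits; the 2-bit weight type uint2b_t is 2 bits.
    static_assert(cutlass::sizeof_bits<cutlass::half_t>::value == 16, "");
    static_assert(cutlass::sizeof_bits<cutlass::uint2b_t>::value == 2, "");

    // ThreadblockK = 128 * 8 / sizeof_bits<TypeA> = 1024 / 16 = 64 for half_t.
    static_assert(128 * 8 / cutlass::sizeof_bits<cutlass::half_t>::value == 64, "");

    // The old ElementsPerAccess = 128 / sizeof_bits<TypeA> also evaluates to 8
    // for half_t; the new code pins the B-operand access width to 8 elements
    // directly instead of deriving it from TypeA's bit width.
    static_assert(128 / cutlass::sizeof_bits<cutlass::half_t>::value == 8, "");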

custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/default_mma.h (6 additions, 6 deletions)

@@ -383,7 +383,7 @@ struct DefaultMma<cutlass::half_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAli
                                                             : cutlass::arch::CacheOperation::Always;

     static cutlass::arch::CacheOperation::Kind const CacheOpB =
-        ((sizeof_bits<half_t>::value * kAlignmentB) == 128) ? cutlass::arch::CacheOperation::Global
+        ((sizeof_bits<uint2b_t>::value * kAlignmentB) == 128) ? cutlass::arch::CacheOperation::Global
                                                             : cutlass::arch::CacheOperation::Always;

     // Define the MmaCore components
@@ -401,9 +401,9 @@ struct DefaultMma<cutlass::half_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAli

     // Define iterators over tiles from the B operand
     using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-    using AccessTypeB = cutlass::Array<half_t, kAlignmentB>;
+    using AccessTypeB = cutlass::Array<uint2b_t, kAlignmentB>;
     using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, half_t, LayoutB, 0, ThreadMapB,
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, uint2b_t, LayoutB, 0, ThreadMapB,
         AccessTypeB>;

     // Define the threadblock-scoped multistage matrix multiply
@@ -446,7 +446,7 @@ struct DefaultMma<half_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAlignmentB,
                                                             : cutlass::arch::CacheOperation::Always;

     static cutlass::arch::CacheOperation::Kind const CacheOpB =
-        ((sizeof_bits<half_t>::value * kAlignmentB) == 128) ? cutlass::arch::CacheOperation::Global
+        ((sizeof_bits<uint2b_t>::value * kAlignmentB) == 128) ? cutlass::arch::CacheOperation::Global
                                                             : cutlass::arch::CacheOperation::Always;

     // Define the MmaCore components
@@ -464,9 +464,9 @@ struct DefaultMma<half_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAlignmentB,

     // Define iterators over tiles from the B operand
     using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-    using AccessTypeB = cutlass::Array<half_t, kAlignmentB>;
+    using AccessTypeB = cutlass::Array<uint2b_t, kAlignmentB>;
     using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, half_t, LayoutB, 0, ThreadMapB,
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, uint2b_t, LayoutB, 0, ThreadMapB,
         AccessTypeB>;

     // Define the threadblock-scoped multistage matrix multiply
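The CacheOpB change re-expresses the same 128-bit access test in terms of the actual 2-bit B element type, and retyping AccessTypeB and the PredicatedTileAccessIterator with uint2b_t makes the iterator address raw 2-bit weights instead of half-precision data. A hedged sketch of the alignment arithmetic (the kAlignmentB values are illustrative, not taken from the commit):

    #include "cutlass/numeric_types.h"

    // With B typed as half_t (16 bits), the 128-bit test passed at kAlignmentB == 8:
    static_assert(cutlass::sizeof_bits<cutlass::half_t>::value * 8 == 128, "");
    // With B typed as uint2b_t (2 bits), it passes at kAlignmentB == 64:
    static_assert(cutlass::sizeof_bits<cutlass::uint2b_t>::value * 64 == 128, "");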

custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/default_mma_bf16.h (6 additions, 6 deletions)

@@ -384,7 +384,7 @@ struct DefaultMma<cutlass::bfloat16_t, LayoutA, kAlignmentA, uint2b_t, LayoutB,
                                                             : cutlass::arch::CacheOperation::Always;

     static cutlass::arch::CacheOperation::Kind const CacheOpB =
-        ((sizeof_bits<bfloat16_t>::value * kAlignmentB) == 128) ? cutlass::arch::CacheOperation::Global
+        ((sizeof_bits<uint2b_t>::value * kAlignmentB) == 128) ? cutlass::arch::CacheOperation::Global
                                                             : cutlass::arch::CacheOperation::Always;

     // Define the MmaCore components
@@ -402,9 +402,9 @@ struct DefaultMma<cutlass::bfloat16_t, LayoutA, kAlignmentA, uint2b_t, LayoutB,

     // Define iterators over tiles from the B operand
     using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-    using AccessTypeB = cutlass::Array<bfloat16_t, kAlignmentB>;
+    using AccessTypeB = cutlass::Array<uint2b_t, kAlignmentB>;
     using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, bfloat16_t, LayoutB, 0, ThreadMapB,
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, uint2b_t, LayoutB, 0, ThreadMapB,
         AccessTypeB>;

     // Define the threadblock-scoped multistage matrix multiply
@@ -447,7 +447,7 @@ struct DefaultMma<bfloat16_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAlignmen
                                                             : cutlass::arch::CacheOperation::Always;

     static cutlass::arch::CacheOperation::Kind const CacheOpB =
-        ((sizeof_bits<bfloat16_t>::value * kAlignmentB) == 128) ? cutlass::arch::CacheOperation::Global
+        ((sizeof_bits<uint2b_t>::value * kAlignmentB) == 128) ? cutlass::arch::CacheOperation::Global
                                                             : cutlass::arch::CacheOperation::Always;

     // Define the MmaCore components
@@ -465,9 +465,9 @@ struct DefaultMma<bfloat16_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAlignmen

     // Define iterators over tiles from the B operand
     using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-    using AccessTypeB = cutlass::Array<bfloat16_t, kAlignmentB>;
+    using AccessTypeB = cutlass::Array<uint2b_t, kAlignmentB>;
     using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, bfloat16_t, LayoutB, 0, ThreadMapB,
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, uint2b_t, LayoutB, 0, ThreadMapB,
         AccessTypeB>;

     // Define the threadblock-scoped multistage matrix multiply
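The bfloat16_t specializations receive the identical substitution; since sizeof_bits<bfloat16_t>::value is also 16, the alignment arithmetic sketched above for half_t carries over unchanged.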

custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_base.h (2 additions, 5 deletions)

@@ -104,8 +104,6 @@ class Wint2xMmaBase {
     using TensorRefB =
         TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;

-    // using TensorRefZippedB = TensorRef<uint8_t, typename Operator::LayoutB>;
-
     static_assert(kWarpGemmIterations > 1,
                   "The pipelined structure requires at least two warp-level "
                   "GEMM operations.");
@@ -130,12 +128,11 @@ class Wint2xMmaBase {
                                 Shape::kK * kStages + Policy::SmemPaddingA::kColumn>;

     /// Shape of the B matrix operand in shared memory
-    using ShapeB = MatrixShape<Shape::kK + Policy::SmemPaddingB::kRow,
+    using ShapeB = MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
                                Shape::kN + Policy::SmemPaddingB::kColumn>;

     // w uint8; local_scale uint8;
-    constexpr static int kZippedRowsPerStages =
-        Shape::kK / 4 + (Shape::kK + 127) / 128;
+    constexpr static int kZippedRowsPerStages = Shape::kK / 4 + (Shape::kK + 127) / 128;

     // code_scale float; code_zp float; super_scale ElementB
     constexpr static int kColumnWiseParamsRows = 2 * sizeof(float) +
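The ShapeB change sizes the B operand's shared-memory region for all pipeline stages, mirroring how ShapeA is already multiplied by kStages. A hedged worked example (tile sizes are illustrative, not values from this commit):

    // Illustrative only: Shape::kK = 64, kStages = 3, padding ignored.
    constexpr int kK = 64;
    constexpr int kStages = 3;
    constexpr int old_rows_B = kK;            // 64: room for a single stage of B
    constexpr int new_rows_B = kK * kStages;  // 192: one B tile per pipeline stage
    // Zipped storage per stage: kK / 4 rows of packed weights (four 2-bit values
    // per uint8 byte) plus one local_scale row per 128 elements of K.
    constexpr int kZippedRowsPerStages = kK / 4 + (kK + 127) / 128;  // 16 + 1 = 17
    static_assert(new_rows_B == 192 && kZippedRowsPerStages == 17, "sketch arithmetic");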

custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h (18 additions, 13 deletions)

@@ -90,7 +90,7 @@ template <
     SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
     /// Used for partial specialization
     typename Enable = bool>
-class Wint2xMmaMultistage : 
+class Wint2xMmaMultistage :
     public Wint2xMmaBase<Shape_, Policy_, Stages> {
 public:
     ///< Base class
@@ -282,20 +282,20 @@ class Wint2xMmaMultistage :
     {
         // Advance global iterators
         iterator_A.add_tile_offset({0, 1});
-        //iterator_B.add_tile_offset({1, 0});
-        tile_dequanter_B.AddTileOffset({1, 0});
+        iterator_B.add_tile_offset({1, 0});
+        //tile_dequanter_B.AddTileOffset({1, 0});

         // Advance shared iterators
         smem_iterator_A_.add_tile_offset({0, 1});
-        //smem_iterator_B_.add_tile_offset({1, 0});
+        smem_iterator_B_.add_tile_offset({1, 0});

         // Increment shared memory write stage index
         ++smem_write_stage_idx_;

         if (smem_write_stage_idx_ == Base::kStages) {
             // Wrap back around to the 'start' of the circular buffer in shared memory
             smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            //smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
             smem_write_stage_idx_ = 0;
         }
     }
@@ -476,8 +476,11 @@ class Wint2xMmaMultistage :
         copy_tiles_and_advance_per_stage_A(iterator_A);

         // Async copy zipped B to shared memory.
-        tile_dequanter_B.Load(smem_zipped_ptr_B_ + (stage % Base::kStages) * smem_zipped_bytes_per_stage_B_,
-                              column_wise_smem_ptr_B_, stage);
+        copy_tiles_and_advance_per_stage_B<false, true>(iterator_B);
+
+        // TODO: Async copy other quantized params to shared memory, local_scale, code_scale, code_zp, super_scale.
+        //tile_dequanter_B.Load(smem_zipped_ptr_B_ + (stage % Base::kStages) * smem_zipped_bytes_per_stage_B_,
+        //                      column_wise_smem_ptr_B_, stage);

         // Move to the next write stage
         advance_smem_write_stage(iterator_A, iterator_B, tile_dequanter_B);
@@ -566,11 +569,11 @@ class Wint2xMmaMultistage :
         if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
             // Unpack and dequant the first stage of B.
             int unpack_stage = stage - Base::kStages + 2;
-            tile_dequanter_B.UnpackAndDequant(smem_zipped_ptr_B_ + (unpack_stage % Base::kStages) * smem_zipped_bytes_per_stage_B_,
-                                              column_wise_smem_ptr_B_, unpack_stage);
+            //tile_dequanter_B.UnpackAndDequant(smem_zipped_ptr_B_ + (unpack_stage % Base::kStages) * smem_zipped_bytes_per_stage_B_,
+            //                                  column_wise_smem_ptr_B_, unpack_stage);

             // Copy dequatized data to shared memory used by mma core.
-            copy_tiles_and_advance_per_stage_B<false, false>(iterator_B);
+            //copy_tiles_and_advance_per_stage_B<false, false>(iterator_B);
         }

         // Load the next warp-tile's B fragment from shared memory
@@ -672,10 +675,11 @@ class Wint2xMmaMultistage :
         IteratorB &iterator_B,
         TileDequanterB &tile_dequanter_B) ///< [in|out] iterator over B operand in global memory
     {
+#if 0
         PipeState pipe_state;

         // Unpack and dequant the first stage of B.
-        tile_dequanter_B.UnpackAndDequant(smem_zipped_ptr_B_, column_wise_smem_ptr_B_, 0);
+        //tile_dequanter_B.UnpackAndDequant(smem_zipped_ptr_B_, column_wise_smem_ptr_B_, 0);

         // Disable global fetching if done with global fetch iterations
         iterator_A.clear_mask(gemm_k_iterations == 0);
@@ -687,7 +691,7 @@ class Wint2xMmaMultistage :
         ++this->warp_tile_iterator_A_;

         // Copy dequatized data to shared memory used by mma core.
-        copy_tiles_and_advance_per_stage_B<false, true>(iterator_B);
+        //copy_tiles_and_advance_per_stage_B<false, true>(iterator_B);

         // Load first warp-tile's B fragment from shared memory
         this->warp_tile_iterator_B_.set_kgroup_index(0);
@@ -730,6 +734,7 @@ class Wint2xMmaMultistage :
         cutlass::arch::cp_async_fence();
         cutlass::arch::cp_async_wait<0>();
         __syncthreads();
+#endif
     }

     /// Prepares the class for another prologue.
@@ -794,7 +799,7 @@ class Wint2xMmaMultistage :
         accum = src_accum;

         // Perform the MAC-iterations
-        gemm_iters(gemm_k_iterations, accum, iterator_A, iterator_B, tile_dequanter_B);
+        //gemm_iters(gemm_k_iterations, accum, iterator_A, iterator_B, tile_dequanter_B);
     }
 };
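With the dequanter path commented out, B now follows the same circular-buffer discipline as A in advance_smem_write_stage. A minimal sketch of that bookkeeping, assuming an example stage count (note gemm_iters itself is compiled out with #if 0 in this commit, so only the stage-advance logic above is live):

    // Illustrative only; kStages is an example value, and the commented calls
    // mark where the real iterator updates happen in advance_smem_write_stage().
    void b_stage_walk_sketch() {
        constexpr int kStages = 3;
        int smem_write_stage_idx = 0;
        for (int stage = 0; stage < 2 * kStages; ++stage) {
            // smem_iterator_B_.add_tile_offset({1, 0});            // next K-tile (rows)
            if (++smem_write_stage_idx == kStages) {
                // smem_iterator_B_.add_tile_offset({-kStages, 0}); // wrap to buffer start
                smem_write_stage_idx = 0;
            }
        }
    }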

custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_tile_dequanter.h (3 additions, 0 deletions)

@@ -55,6 +55,9 @@ struct TileDequanter {
     bool need_preload{true};
     UnzipAndDequantFunctor unzip_functor;

+    CUTLASS_DEVICE
+    TileDequanter() {}
+
     CUTLASS_DEVICE
     TileDequanter(MmaElementT *out_smem_ptr, char *pointer, int64_t ldm,
                   const cutlass::MatrixCoord &extent,
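The added default constructor is what lets fused_moe_cutlass_kernel.h (below) declare a bare TileDequanterB tile_dequanter_B; now that the parameterized construction there is commented out.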

custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_cutlass_kernel.h (46 additions, 7 deletions)

@@ -52,6 +52,15 @@ namespace cutlass {
 namespace gemm {
 namespace kernel {

+template <typename Layout> std::string GetCutlassLayoutString() {
+    if (std::is_same<Layout, cutlass::layout::RowMajor>::value) {
+        return "cutlass::layout::RowMajor";
+    } else if (std::is_same<Layout, cutlass::layout::ColumnMajor>::value) {
+        return "cutlass::layout::ColumnMajor";
+    }
+    return "unknown";
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // This section exists to that we can use the same kernel code for regular gemm
 // and dequantizing gemms. It will dispatch to the dequantizing gemm if the Mma
@@ -282,6 +291,27 @@ struct MoeFCGemm {
                    platform::is_same<uint4b_t, ElementB>::value) {
             assert(weight_scales);
         }
+
+        CUTLASS_TRACE_HOST("[Arguments] problem_count: " << problem_count << ", threadblock_count: " << threadblock_count << ", gemm_n: " << gemm_n << ", gemm_k: " << gemm_k);
+        CUTLASS_TRACE_HOST("[Arguments] ptr_A: " << static_cast<void const*>(ptr_A));
+        CUTLASS_TRACE_HOST("[Arguments] ptr_B: " << static_cast<void const*>(ptr_B));
+        CUTLASS_TRACE_HOST("[Arguments] ptr_C: " << static_cast<void const*>(ptr_C));
+        CUTLASS_TRACE_HOST("[Arguments] ptr_D: " << static_cast<void*>(ptr_D));
+        CUTLASS_TRACE_HOST("[Arguments] weight_scales: " << static_cast<void const*>(weight_scales));
+        CUTLASS_TRACE_HOST("[Arguments] total_rows_before_expert: " << static_cast<void*>(total_rows_before_expert));
+        CUTLASS_TRACE_HOST("[Arguments] local_scale: " << static_cast<void const*>(local_scale));
+        CUTLASS_TRACE_HOST("[Arguments] code_scale: " << static_cast<void const*>(code_scale));
+        CUTLASS_TRACE_HOST("[Arguments] code_zp: " << static_cast<void const*>(code_zp));
+        CUTLASS_TRACE_HOST("[Arguments] quant_method: " << static_cast<int>(quant_method));
+        CUTLASS_TRACE_HOST("[Arguments] LayoutA: " << GetCutlassLayoutString<LayoutA>());
+        CUTLASS_TRACE_HOST("[Arguments] LayoutB: " << GetCutlassLayoutString<LayoutB>());
+        CUTLASS_TRACE_HOST("[Arguments] LayoutC: " << GetCutlassLayoutString<LayoutC>());
+        CUTLASS_TRACE_HOST("[Arguments] Mma::IteratorA::AccessType::kElements:" << Mma::IteratorA::AccessType::kElements);
+        CUTLASS_TRACE_HOST("[Arguments] Mma::IteratorB::AccessType::kElements:" << Mma::IteratorB::AccessType::kElements);
+        CUTLASS_TRACE_HOST("[Arguments] SharedStorage Information:");
+        CUTLASS_TRACE_HOST("  - ProblemVisitor::SharedStorage: " << sizeof(typename ProblemVisitor::SharedStorage) << " bytes");
+        CUTLASS_TRACE_HOST("  - Mma::SharedStorage: " << sizeof(typename Mma::SharedStorage) << " bytes");
+        CUTLASS_TRACE_HOST("  - Epilogue::SharedStorage: " << sizeof(typename Epilogue::SharedStorage) << " bytes");
     }
 };

@@ -835,6 +865,13 @@ struct Wint2xMoeFCGemm : public MoeFCGemm<Mma_, Epilogue_, ThreadblockSwizzle_,
       int32_t problem_idx = problem_visitor.problem_index();
       int32_t cta_idx = int32_t(problem_visitor.threadblock_idx());

+      CUTLASS_TRACE_DEVICE(" problem_idx: %d, cta_idx: %d, problem_size: {%d, %d, %d}",
+                           problem_idx, cta_idx, static_cast<int>(problem_size.m()), static_cast<int>(problem_size.n()), static_cast<int>(problem_size.k()));
+
+      if (problem_idx > 2) {
+        break;
+      }
+
       GemmCoord grid_shape = problem_visitor.grid_shape(problem_size);

       // threadblock_offset of C
@@ -879,16 +916,16 @@ struct Wint2xMoeFCGemm : public MoeFCGemm<Mma_, Epilogue_, ThreadblockSwizzle_,
          platform::is_same<layout::RowMajor, LayoutB>::value
              ? gemm_n
              : gemm_k * kInterleave;
-      typename LayoutB::LongIndex ldm_B_shared = TileDequanterB::kColumns;
+      //typename LayoutB::LongIndex ldm_B_shared = TileDequanterB::kColumns;

       // the begin threadblock_offset of B, which holds the same column id with C
       cutlass::MatrixCoord tb_offset_B{0,
                                        threadblock_offset.n() / kInterleave};

       cutlass::MatrixCoord extent_B{problem_size.k() * kInterleave, problem_size.n() / kInterleave};
-      cutlass::MatrixCoord extent_B_shared{TileDequanterB::kRows, TileDequanterB::kColumns};
+      //cutlass::MatrixCoord extent_B_shared{TileDequanterB::kRows, TileDequanterB::kColumns};

-      MmaElementB* smem_unzip_B_ptr = nullptr;
+      /*MmaElementB* smem_unzip_B_ptr = nullptr;
       if constexpr (QuantMethod == WintQuantMethod::kWeightOnlyInt2) {
         smem_unzip_B_ptr = shared_storage.main_loop.operand_unzip_B_ptr();
       }
@@ -901,7 +938,9 @@ struct Wint2xMoeFCGemm : public MoeFCGemm<Mma_, Epilogue_, ThreadblockSwizzle_,
          weight_scale_ptr,
          tb_offset_scale,
          quant_args);
-      MmaElementB* ptr_B = tile_dequanter_B.GetOutPtr();
+      MmaElementB* ptr_B = tile_dequanter_B.GetOutPtr();*/
+      TileDequanterB tile_dequanter_B;
+      ElementB* ptr_B = reinterpret_cast<ElementB*>(byte_ptr_B);

       // Compute position within threadblock
       int thread_idx = threadIdx.x;
@@ -914,11 +953,11 @@ struct Wint2xMoeFCGemm : public MoeFCGemm<Mma_, Epilogue_, ThreadblockSwizzle_,
          tb_offset_A);

       typename Mma::IteratorB iterator_B(
-          LayoutB(TileDequanterB::kUseSharedMemory ? ldm_B_shared : ldm_B),
+          LayoutB(ldm_B),
          ptr_B,
-          TileDequanterB::kUseSharedMemory ? extent_B_shared : extent_B,
+          extent_B,
          thread_idx,
-          TileDequanterB::kUseSharedMemory ? cutlass::make_Coord(0, 0) : tb_offset_B);
+          tb_offset_B);

       typename Mma::FragmentC accumulators;

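For reference, a minimal host-side sketch of the new GetCutlassLayoutString helper. Illustrative only: it assumes this header and its CUTLASS dependencies are on the include path, and CUTLASS_TRACE_HOST itself only emits when CUTLASS's host debug tracing is compiled in:

    #include <iostream>
    // Assumes fused_moe_cutlass_kernel.h (and CUTLASS) is included, so the
    // helper, which lives in cutlass::gemm::kernel, is visible here.

    int main() {
        using cutlass::gemm::kernel::GetCutlassLayoutString;
        // The helper resolves the layout tag with std::is_same, exactly as above.
        std::cout << GetCutlassLayoutString<cutlass::layout::ColumnMajor>() << "\n";
        // prints: cutlass::layout::ColumnMajor
        std::cout << GetCutlassLayoutString<cutlass::layout::RowMajor>() << "\n";
        // prints: cutlass::layout::RowMajor
        return 0;
    }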