PaddlePaddle
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h
Lines changed: 53 additions & 9 deletions
@@ -47,6 +47,8 @@
 #include "cutlass_extensions/gemm/threadblock/wint2x_mma_base.h"
 #include "cutlass_extensions/gemm/threadblock/wint2x_tile_dequanter.h"
 
+#include "cutlass_extensions/gemm/warp/mma_tensorop_wint2x_dequantizer.h"
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 namespace cutlass {
@@ -129,6 +131,17 @@ class Wint2xMmaMultistage :
   /// Minimum architecture is Sm80 to support cp.async
   using ArchTag = arch::Sm80;
 
+  using LayoutScale = cutlass::layout::ColumnMajor;  // NOTE(review): not referenced by the Dequantizer alias below, which spells ColumnMajor directly
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;  // B fragment type taken from the warp-level Operator
+  using ElementB = typename WarpTransformedFragmentB::Element;  // element type of that fragment
+  using Dequantizer = warp::MmaTensorOpWin2xDequantizer<Operator, typename Base::WarpGemm, Operand::kB, ElementB,  // dequantizer bound to operand B
+    cutlass::layout::ColumnMajor, 32, WeightOnlyQuantOp::UNDEFINED>;  // NOTE(review): 32 is presumably the warp size - confirm against the dequantizer header
+
+  static_assert(  // sizeof needs a complete type, so this forces Dequantizer to be instantiated here
+      sizeof(Dequantizer) > 0,
+      "Dequantizer template instantiation failed"
+  );
+
   /// Complex transform on A operand
   static ComplexTransform const kTransformA = Operator::kTransformA;
 
@@ -196,6 +209,9 @@ class Wint2xMmaMultistage :
   /// Warp-level MMA operator
   Operator warp_mma_;
 
+  /// Warp-level dequantizer applied to the loaded operand B fragments (wint2x)
+  Dequantizer warp_dequantizer_;
+
   /// Iterator to write threadblock-scoped tile of A operand to shared memory
   SmemIteratorA smem_iterator_A_;
 
@@ -679,12 +695,41 @@ class Wint2xMmaMultistage :
     this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_[0]);
     ++this->warp_tile_iterator_B_;
 
-    // Transform, if necessary, the first warp-tile's shared memory fragments
-    warp_mma_.transform(
-      pipe_state.warp_transformed_frag_A_[0],
-      pipe_state.warp_transformed_frag_B_[0],
-      pipe_state.warp_loaded_frag_A_[0],
-      pipe_state.warp_loaded_frag_B_[0]);
+    // // Transform, if necessary, the first warp-tile's shared memory fragments
+    // warp_mma_.transform(
+    //   pipe_state.warp_transformed_frag_A_[0],
+    //   pipe_state.warp_transformed_frag_B_[0],
+    //   pipe_state.warp_loaded_frag_A_[0],
+    //   pipe_state.warp_loaded_frag_B_[0]);
+
+    __syncthreads(); // make sure every thread has reached this point
+    if (threadIdx.x == 0) { // let only one thread print, to avoid duplicated output
+        // printf("DEBUG: warp_loaded_frag_A_[0] values:\n");
+        for (int i = 0; i < pipe_state.warp_loaded_frag_A_[0].size(); ++i) {
+          // read one element of the fragment
+          auto val = pipe_state.warp_loaded_frag_A_[0][i];
+ 
+          // reinterpret the element as a 16-bit uint16_t to inspect its raw bit pattern
+          uint16_t bits = reinterpret_cast<const uint16_t*>(&val)[0];
+
+          CUTLASS_TRACE_DEVICE(" warp_loaded_frag_A_[%d] = 0x%04x", i, bits);        
+      }
+    }
+    __syncthreads();  // NOTE(review): this barrier pair and the dump between them look like debug scaffolding - confirm before merging
+
+    typename Dequantizer::FragmentLocalScale warp_frag_local_scale;  // the four fragments below are handed to warp_dequantizer_.load()
+    typename Dequantizer::FragmentCodeScale warp_frag_code_scale;
+    typename Dequantizer::FragmentCodeZp warp_frag_code_zp;
+    typename Dequantizer::FragmentSuperScale warp_frag_super_scale;
+
+    typename Dequantizer::FragmentOutOperand warp_frag_out;  // first argument of dequantize() below
+
+    CUTLASS_TRACE_DEVICE(" warp_dequantizer_ - start load");
+    warp_dequantizer_.load(warp_frag_local_scale, warp_frag_code_scale, warp_frag_code_zp, warp_frag_super_scale);  // presumably fills the four fragments - confirm in the dequantizer header
+    __syncthreads();
+
+    CUTLASS_TRACE_DEVICE("warp_dequantizer_ - start dequant");
+    warp_dequantizer_.dequantize(warp_frag_out, pipe_state.warp_loaded_frag_B_[0], warp_frag_local_scale, warp_frag_code_scale, warp_frag_code_zp, warp_frag_super_scale);  // NOTE(review): warp_frag_out is not read in the visible lines and warp_mma_.transform() is commented out above - confirm warp_transformed_frag_*[0] is set before the first MMA
 
     if (Detail::kStagedAccumulation) {
       pipe_state.tmp_accum_.clear();
@@ -770,18 +815,17 @@ class Wint2xMmaMultistage :
       TileDequanterB tile_dequanter_B,
       ///< initial value of accumulator
       FragmentC const &src_accum) {
-
     // Prologue (start fetching iterations of global fragments into shared memory)
     prologue(iterator_A, iterator_B, tile_dequanter_B, gemm_k_iterations);
 
     // Wait until we have at least one completed global fetch stage
     gmem_wait();
-
+    
     // Initialize destination accumulators with source accumulators
     accum = src_accum;
 
     // Perform the MAC-iterations
-    //gemm_iters(gemm_k_iterations, accum, iterator_A, iterator_B, tile_dequanter_B);
+    gemm_iters(gemm_k_iterations, accum, iterator_A, iterator_B, tile_dequanter_B);
   }
 };