Add ping-pong buffer for warp_loaded_frag_B_

baoqiwen · baoqiwen · commit e0b366f39f93 · 2025-07-15T20:21:55.000+08:00
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h
@@ -209,7 +209,7 @@ class Wint2xMmaMultistage :
     WarpTransformedFragmentA warp_frag_A_[2];
 
     /// Pair of B fragments used to overlap shared memory loads and math instructions
-    WarpLoadedFragmentB warp_loaded_frag_B_;
+    WarpLoadedFragmentB warp_loaded_frag_B_[2];
     WarpTransformedFragmentB warp_frag_B_;
   };
 
@@ -691,10 +691,10 @@ class Wint2xMmaMultistage :
       int warp_k_compute_offset_B = warp_mma_k % Base::kWarpGemmIterationsPerLoadForB;
       int warp_mma_k_for_B = warp_mma_k / Base::kWarpGemmIterationsPerLoadForB;
 
-      if (warp_k_compute_offset_B  == Base::kWarpGemmIterationsPerLoadForB - 1) {
+      if (warp_k_compute_offset_B == Base::kWarpGemmIterationsPerLoadForB - 1) {
         // Load the next warp-tile's B fragment from shared memory
         this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k_for_B + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_);
+        this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_[(warp_mma_k_for_B + 1) % 2]);
         ++this->warp_tile_iterator_B_;
 
         warp_dequantizer_.load(pipe_state.warp_frag_local_scale_);
@@ -718,6 +718,16 @@ class Wint2xMmaMultistage :
       //     static_cast<int>(reg_uint8_ptr[14]), static_cast<int>(reg_uint8_ptr[15]),
       //     sizeof_bits<typename PipeState::WarpLoadedFragmentB>::value / 8);
 
+      if (warp_k_compute_offset_B == 0) {
+        warp_dequantizer_.dequantize(pipe_state.warp_frag_local_scale_,
+                                     pipe_state.warp_frag_code_scale_,
+                                     pipe_state.warp_frag_code_zp_,
+                                     pipe_state.warp_frag_super_scale_,
+                                     pipe_state.warp_loaded_frag_B_[warp_mma_k_for_B % 2],
+                                     pipe_state.warp_frag_B_,
+                                     (stage - Base::kStages + 2) * Shape::kK);
+      }
+
       if (Detail::kStagedAccumulation) {
         //CUTLASS_TRACE_DEVICE(" [MMa-kStagedAccumulation][stage=%d] warp_mma_k=%d, warp_k_compute_offset_B=%d", stage, warp_mma_k, warp_k_compute_offset_B);
         warp_mma_(
@@ -767,27 +777,6 @@ class Wint2xMmaMultistage :
         //     static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][2]), static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][3]),
         //     static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][4]), static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][5]),
         //     static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][6]), static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][7]));
-
-        // CUTLASS_TRACE_DEVICE_TID(" now1 unpacked_frag_B[0:15]=[%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f]",
-        //       static_cast<float>(unpacked_frag_B[0]), static_cast<float>(unpacked_frag_B[1]),
-        //       static_cast<float>(unpacked_frag_B[2]), static_cast<float>(unpacked_frag_B[3]),
-        //       static_cast<float>(unpacked_frag_B[4]), static_cast<float>(unpacked_frag_B[5]),
-        //       static_cast<float>(unpacked_frag_B[6]), static_cast<float>(unpacked_frag_B[7]),
-        //       static_cast<float>(unpacked_frag_B[8]), static_cast<float>(unpacked_frag_B[9]),
-        //       static_cast<float>(unpacked_frag_B[10]), static_cast<float>(unpacked_frag_B[11]),
-        //       static_cast<float>(unpacked_frag_B[12]), static_cast<float>(unpacked_frag_B[13]),
-        //       static_cast<float>(unpacked_frag_B[14]), static_cast<float>(unpacked_frag_B[15]));
-
-        // CUTLASS_TRACE_DEVICE_TID(" warp_k_compute_offset_B = %d, now1 tmp_accum_[0:15]=[%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f]",
-        //       warp_k_compute_offset_B,
-        //       static_cast<float>(accum[0]), static_cast<float>(accum[1]),
-        //       static_cast<float>(accum[2]), static_cast<float>(accum[3]),
-        //       static_cast<float>(accum[4]), static_cast<float>(accum[5]),
-        //       static_cast<float>(accum[6]), static_cast<float>(accum[7]),
-        //       static_cast<float>(accum[8]), static_cast<float>(accum[9]),
-        //       static_cast<float>(accum[10]), static_cast<float>(accum[11]),
-        //       static_cast<float>(accum[12]), static_cast<float>(accum[13]),
-        //       static_cast<float>(accum[14]), static_cast<float>(accum[15]));
 #endif
       }
 
@@ -835,16 +824,6 @@ class Wint2xMmaMultistage :
         iterator_B.clear_mask(gemm_k_iterations == 0);
         quant_params_accessor_B_.clear_mask(mma_quant_args, gemm_k_iterations == 0);
       }
-
-      if (warp_k_compute_offset_B == Base::kWarpGemmIterationsPerLoadForB - 1) {
-        warp_dequantizer_.dequantize(pipe_state.warp_frag_local_scale_,
-                                     pipe_state.warp_frag_code_scale_,
-                                     pipe_state.warp_frag_code_zp_,
-                                     pipe_state.warp_frag_super_scale_,
-                                     pipe_state.warp_loaded_frag_B_,
-                                     pipe_state.warp_frag_B_,
-                                     (stage - Base::kStages + 2) * Shape::kK);
-      }
     }
   }
 
@@ -882,7 +861,7 @@ class Wint2xMmaMultistage :
 
     // Load first warp-tile's B fragment from shared memory
     this->warp_tile_iterator_B_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_);
+    this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_[0]);
     ++this->warp_tile_iterator_B_;
 
 #if 0
@@ -928,14 +907,6 @@ class Wint2xMmaMultistage :
     }
 #endif
 
-    warp_dequantizer_.dequantize(pipe_state.warp_frag_local_scale_,
-                                 pipe_state.warp_frag_code_scale_,
-                                 pipe_state.warp_frag_code_zp_,
-                                 pipe_state.warp_frag_super_scale_,
-                                 pipe_state.warp_loaded_frag_B_,
-                                 pipe_state.warp_frag_B_,
-                                 0);
-
 #if 0
     if (TransformBAfterLDS::result_type::kElements == 64) {
       CUTLASS_TRACE_DEVICE(" TransformBAfterLDS::result_type::kElements: 64, %d bytes", sizeof_bits<typename TransformBAfterLDS::result_type>::value / 8);