PaddlePaddle
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/default_wint2x_mma.h b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/default_wint2x_mma.h
Lines changed: 5 additions & 1 deletion
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_base.h b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_base.h
Lines changed: 19 additions & 19 deletions
@@ -19,6 +19,7 @@
 
 #include "cutlass_extensions/arch/mma.h"
 #include "cutlass_extensions/interleaved_numeric_conversion.h"
+#include "cutlass_extensions/gemm/threadblock/default_dq_mma.h"
 #include "cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h"
 
 namespace cutlass {
@@ -156,13 +157,16 @@ struct DefaultWint2xMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlig
         IteratorShapeB, ElementB, layout::ColumnMajor, 0, InterleavedThreadMapB,
         AccessTypeB>;
 
+    using TransformBAfterLDS = FastInterleavedAndBiasedNumericArrayConverter<
+        ElementA, ElementB, MmaCore::MmaPolicy::Operator::FragmentB::kElements>;
+
     // Define the threadblock-scoped multistage matrix multiply
     using ThreadblockMma = cutlass::gemm::threadblock::Wint2xMmaMultistage<
         typename MmaCore::Shape,
         IteratorA, typename MmaCore::SmemIteratorA, MmaCore::kCacheOpA,
         IteratorB, typename MmaCore::SmemIteratorB, MmaCore::kCacheOpB,
         ElementAccumulator, layout::RowMajor,
-        typename MmaCore::MmaPolicy, kStages, SharedMemoryClear>;
+        typename MmaCore::MmaPolicy, kStages, TransformBAfterLDS, SharedMemoryClear>;
 };
 
 } // namespace threadblock
 
@@ -93,6 +93,15 @@ class Wint2xMmaBase {
   static int const kWarpGemmIterations =
       (WarpGemm::kK / Operator::Policy::MmaShape::kK);
 
+  /// Number of warp-level GEMM operations per load for B
+  static constexpr int kWarpGemmIterationsPerLoadForB =
+      Operator::IteratorB::InstructionShape::kRow / Operator::InstructionShape::kK;
+  static_assert(!(kWarpGemmIterations % kWarpGemmIterationsPerLoadForB), "");
+
+  static constexpr int kWarpLoadIterationsForB =
+      kWarpGemmIterations / kWarpGemmIterationsPerLoadForB;
+
+
   /// Number of stages
   static int const kStages = Stages;
 
@@ -131,16 +140,16 @@ class Wint2xMmaBase {
     using ShapeB = MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
                                Shape::kN + Policy::SmemPaddingB::kColumn>;
 
-    // w uint8; local_scale uint8;
-    constexpr static int kZippedRowsPerStages = Shape::kK / 4 + (Shape::kK + 127) / 128;
+    // local_scale uint4
+    constexpr static int kGroupWiseParamRows = Shape::kK / 64;
+
+    using GroupWiseParamShapeB = MatrixShape<kGroupWiseParamRows * kStages, Shape::kN>;
 
     // code_scale float; code_zp float; super_scale ElementB
-    constexpr static int kColumnWiseParamsRows = 2 * sizeof(float) +
+    constexpr static int kColumnWiseParamRows = 2 * sizeof(float) +
         sizeof_bits<typename Operator::ElementB>::value / 8;
 
-    using ZippedShapeB = MatrixShape<kColumnWiseParamsRows + kZippedRowsPerStages * kStages, Shape::kN>;
-
-    using NopaddingShapeB = MatrixShape<Shape::kK, Shape::kN>;
+    using ColumnWiseParamShapeB = MatrixShape<kColumnWiseParamRows, Shape::kN>;
 
   public:
     //
@@ -153,12 +162,11 @@ class Wint2xMmaBase {
     /// Buffer for B operand
     AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
 
-    /// Buffer for quanted B operand
-    AlignedBuffer<uint8_t, ZippedShapeB::kCount> operand_zipped_B;
+    /// Buffer for local_scale of B operand
+    AlignedBuffer<uint4b_t, GroupWiseParamShapeB::kCount> operand_local_scale_B;
 
-    /// Buffer for unzip B operand
-    AlignedBuffer<typename Operator::ElementB, NopaddingShapeB::kCount>
-        operand_unzip_B;
+    /// Buffer for column-wise params of B operand
+    AlignedBuffer<uint8_t, ColumnWiseParamShapeB::kCount> operand_column_wise_B;
 
   public:
     //
@@ -188,14 +196,6 @@ class Wint2xMmaBase {
     TensorRefB operand_B_ref() {
       return TensorRefB{operand_B.data(), LayoutB()};
     }
-
-    CUTLASS_HOST_DEVICE
-    uint8_t *operand_zipped_B_ptr() { return operand_zipped_B.data(); }
-
-    CUTLASS_HOST_DEVICE
-    typename Operator::ElementB *operand_unzip_B_ptr() {
-      return operand_unzip_B.data();
-    }
   };
 
 protected: