PaddlePaddle
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h
Lines changed: 120 additions & 25 deletions
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_params_accessor.h b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_params_accessor.h
Lines changed: 2 additions & 2 deletions
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/warp/mma_tensorop_wint2x_dequantizer.h b/custom_ops/gpu_ops/cutlass_extensions/gemm/warp/mma_tensorop_wint2x_dequantizer.h
Lines changed: 12 additions & 2 deletions
@@ -44,9 +44,9 @@
 #include "cutlass/numeric_types.h"
 
 #include "cutlass_extensions/arch/memory_copy_sm80.h"
-#include "cutlass_extensions/gemm/warp/mma_tensorop_wint2x_dequantizer.h"
 #include "cutlass_extensions/gemm/threadblock/wint2x_mma_base.h"
 #include "cutlass_extensions/gemm/threadblock/wint2x_params_accessor.h"
+#include "cutlass_extensions/gemm/warp/mma_tensorop_wint2x_dequantizer.h"
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -292,32 +292,32 @@ class Wint2xMmaMultistage :
     int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
     int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
 
-    CUTLASS_TRACE_DEVICE(" Shape: {%d, %d, %d}, IteratorB::Shape: {%d, %d}, kInterleave: %d",
-        Shape::kM, Shape::kN, Shape::kK, IteratorB::Shape::kRow, IteratorB::Shape::kColumn, kInterleave);
-    CUTLASS_TRACE_DEVICE(" kPartitionsK=%d, kWarpGemmIterations=%d, WarpCount={%d, %d}, warp_idx_m=%d, warp_idx_n=%d, warp_idx_k=%d",
-        Policy::kPartitionsK, Base::kWarpGemmIterations,
-        Base::WarpCount::kM, Base::WarpCount::kN, warp_idx_m, warp_idx_n, warp_idx_k);
+    //CUTLASS_TRACE_DEVICE(" Shape: {%d, %d, %d}, IteratorB::Shape: {%d, %d}, kInterleave: %d",
+    //    Shape::kM, Shape::kN, Shape::kK, IteratorB::Shape::kRow, IteratorB::Shape::kColumn, kInterleave);
+    //CUTLASS_TRACE_DEVICE(" kPartitionsK=%d, kWarpGemmIterations=%d, WarpCount={%d, %d}, warp_idx_m=%d, warp_idx_n=%d, warp_idx_k=%d",
+    //    Policy::kPartitionsK, Base::kWarpGemmIterations,
+    //    Base::WarpCount::kM, Base::WarpCount::kN, warp_idx_m, warp_idx_n, warp_idx_k);
 
     // Add per-warp offsets in units of warp-level tiles
     this->warp_tile_iterator_A_.add_tile_offset(
         {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
     this->warp_tile_iterator_B_.add_tile_offset(
         {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
 
-    CUTLASS_TRACE_DEVICE(" Policy::SmemPaddingA: {%d, %d}; Policy::SmemPaddingB: {%d, %d}",
-        Policy::SmemPaddingA::kRow, Policy::SmemPaddingA::kColumn, Policy::SmemPaddingB::kRow, Policy::SmemPaddingB::kColumn);
-    CUTLASS_TRACE_DEVICE(" operand_A_ptr=%p, kRow=%d, kColumn=%d, %d bytes;  kElementsPerAccess=%d, sizeof(AccessType)=%d, AsyncCopyIterationsPerStageA=%d, kAccessesPerVectorA=%d",
-        shared_storage.operand_A.data(),
-        static_cast<int>(Base::SharedStorage::ShapeA::kRow), static_cast<int>(Base::SharedStorage::ShapeA::kColumn),
-        static_cast<int>(sizeof(shared_storage.operand_A)),
-        static_cast<int>(IteratorA::ThreadMap::kElementsPerAccess), static_cast<int>(sizeof(typename IteratorA::AccessType)),
-        static_cast<int>(Detail::AsyncCopyIterationsPerStageA), static_cast<int>(IteratorA::kAccessesPerVector));
-    CUTLASS_TRACE_DEVICE(" operand_B_ptr=%p, kRow=%d, kColumn=%d, %d bytes; kElementsPerAccess=%d, sizeof(AccessType)=%d, AsyncCopyIterationsPerStageB=%d, kAccessesPerVectorA=%d",
-        shared_storage.operand_B.data(),
-        static_cast<int>(Base::SharedStorage::ShapeB::kRow), static_cast<int>(Base::SharedStorage::ShapeB::kColumn),
-        static_cast<int>(sizeof(shared_storage.operand_B)),
-        static_cast<int>(IteratorB::ThreadMap::kElementsPerAccess), static_cast<int>(sizeof(typename IteratorB::AccessType)),
-        static_cast<int>(Detail::AsyncCopyIterationsPerStageB), static_cast<int>(IteratorB::kAccessesPerVector));
+    //CUTLASS_TRACE_DEVICE(" Policy::SmemPaddingA: {%d, %d}; Policy::SmemPaddingB: {%d, %d}",
+    //    Policy::SmemPaddingA::kRow, Policy::SmemPaddingA::kColumn, Policy::SmemPaddingB::kRow, Policy::SmemPaddingB::kColumn);
+    //CUTLASS_TRACE_DEVICE(" operand_A_ptr=%p, kRow=%d, kColumn=%d, %d bytes;  kElementsPerAccess=%d, sizeof(AccessType)=%d, AsyncCopyIterationsPerStageA=%d, kAccessesPerVectorA=%d",
+    //    shared_storage.operand_A.data(),
+    //    static_cast<int>(Base::SharedStorage::ShapeA::kRow), static_cast<int>(Base::SharedStorage::ShapeA::kColumn),
+    //    static_cast<int>(sizeof(shared_storage.operand_A)),
+    //    static_cast<int>(IteratorA::ThreadMap::kElementsPerAccess), static_cast<int>(sizeof(typename IteratorA::AccessType)),
+    //    static_cast<int>(Detail::AsyncCopyIterationsPerStageA), static_cast<int>(IteratorA::kAccessesPerVector));
+    //CUTLASS_TRACE_DEVICE(" operand_B_ptr=%p, kRow=%d, kColumn=%d, %d bytes; kElementsPerAccess=%d, sizeof(AccessType)=%d, AsyncCopyIterationsPerStageB=%d, kAccessesPerVectorA=%d",
+    //    shared_storage.operand_B.data(),
+    //    static_cast<int>(Base::SharedStorage::ShapeB::kRow), static_cast<int>(Base::SharedStorage::ShapeB::kColumn),
+    //    static_cast<int>(sizeof(shared_storage.operand_B)),
+    //    static_cast<int>(IteratorB::ThreadMap::kElementsPerAccess), static_cast<int>(sizeof(typename IteratorB::AccessType)),
+    //    static_cast<int>(Detail::AsyncCopyIterationsPerStageB), static_cast<int>(IteratorB::kAccessesPerVector));
 
     smem_ptr_A_ = reinterpret_cast<ElementA*>(shared_storage.operand_A.data());
     smem_ptr_B_ = reinterpret_cast<uint8_t*>(shared_storage.operand_B.data());
@@ -678,9 +678,11 @@ class Wint2xMmaMultistage :
     int &gemm_k_iterations, ///< [in|out] number of threadblock mainloop iterations remaining
     int stage)
   {
+
     // Unroll the warp-level MMA tiles of a threadblock's mainloop iteration
     CUTLASS_PRAGMA_UNROLL
     for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
+
       // Load the next warp-tile's A fragment from shared memory
       this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
       this->warp_tile_iterator_A_.load(pipe_state.warp_frag_A_[(warp_mma_k + 1) % 2]);
@@ -699,31 +701,55 @@ class Wint2xMmaMultistage :
       }
 
       // Execute the current warp-tile of MMA operations
+
+      // CUTLASS_TRACE_DEVICE("ElementA %d", PipeState::WarpTransformedFragmentA::kElements);
+      // CUTLASS_TRACE_DEVICE("ElementB %d", PipeState::WarpTransformedFragmentB::kElements);
+      // CUTLASS_TRACE_DEVICE("kStagedAccumulation %d", Detail::kStagedAccumulation);
+
+      // uint8_t* reg_uint8_ptr = reinterpret_cast<uint8_t*>(pipe_state.warp_loaded_frag_B_[warp_mma_k % 2].data());
+      // CUTLASS_TRACE_DEVICE(" reg_uint8_ptr=[%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d], %d bytes",
+      //     static_cast<int>(reg_uint8_ptr[0]), static_cast<int>(reg_uint8_ptr[1]),
+      //     static_cast<int>(reg_uint8_ptr[2]), static_cast<int>(reg_uint8_ptr[3]),
+      //     static_cast<int>(reg_uint8_ptr[4]), static_cast<int>(reg_uint8_ptr[5]),
+      //     static_cast<int>(reg_uint8_ptr[6]), static_cast<int>(reg_uint8_ptr[7]),
+      //     static_cast<int>(reg_uint8_ptr[8]), static_cast<int>(reg_uint8_ptr[9]),
+      //     static_cast<int>(reg_uint8_ptr[10]), static_cast<int>(reg_uint8_ptr[11]),
+      //     static_cast<int>(reg_uint8_ptr[12]), static_cast<int>(reg_uint8_ptr[13]),
+      //     static_cast<int>(reg_uint8_ptr[14]), static_cast<int>(reg_uint8_ptr[15]),
+      //     sizeof_bits<typename PipeState::WarpLoadedFragmentB>::value / 8);
+
       if (Detail::kStagedAccumulation) {
         //CUTLASS_TRACE_DEVICE(" [MMa-kStagedAccumulation][stage=%d] warp_mma_k=%d, warp_k_compute_offset_B=%d", stage, warp_mma_k, warp_k_compute_offset_B);
         warp_mma_(
           pipe_state.tmp_accum_,
           pipe_state.warp_frag_A_[warp_mma_k % 2],
           pipe_state.warp_frag_B_,
+          // unpacked_frag_B,
           pipe_state.tmp_accum_,
           warp_k_compute_offset_B
         );
 
-        if (warp_mma_k == 0) {
-          plus<FragmentC> plus_accum;
-          accum = plus_accum(accum, pipe_state.tmp_accum_);
-          pipe_state.tmp_accum_.clear();
-        }
       } else {
         //CUTLASS_TRACE_DEVICE(" [MMa][stage=%d] warp_mma_k=%d, warp_k_compute_offset_B=%d", stage, warp_mma_k, warp_k_compute_offset_B);
         warp_mma_(
           accum,
           pipe_state.warp_frag_A_[warp_mma_k % 2],
           pipe_state.warp_frag_B_,
+          // unpacked_frag_B,
           accum,
           warp_k_compute_offset_B
         );
 #if 0
+        CUTLASS_TRACE_DEVICE(" pipe_state.warp_frag_B_=[%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f]",
+            static_cast<float>(pipe_state.warp_frag_B_[0]), static_cast<float>(pipe_state.warp_frag_B_[1]),
+            static_cast<float>(pipe_state.warp_frag_B_[2]), static_cast<float>(pipe_state.warp_frag_B_[3]),
+            static_cast<float>(pipe_state.warp_frag_B_[4]), static_cast<float>(pipe_state.warp_frag_B_[5]),
+            static_cast<float>(pipe_state.warp_frag_B_[6]), static_cast<float>(pipe_state.warp_frag_B_[7]),
+            static_cast<float>(pipe_state.warp_frag_B_[8]), static_cast<float>(pipe_state.warp_frag_B_[9]),
+            static_cast<float>(pipe_state.warp_frag_B_[10]), static_cast<float>(pipe_state.warp_frag_B_[11]),
+            static_cast<float>(pipe_state.warp_frag_B_[12]), static_cast<float>(pipe_state.warp_frag_B_[13]),
+            static_cast<float>(pipe_state.warp_frag_B_[14]), static_cast<float>(pipe_state.warp_frag_B_[15]));
+
         if (FragmentC::kElements == 16) {
           CUTLASS_TRACE_DEVICE(" tile_C[0:15]=[%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f]",
                 static_cast<float>(accum[0]), static_cast<float>(accum[1]),
@@ -735,6 +761,12 @@ class Wint2xMmaMultistage :
                 static_cast<float>(accum[12]), static_cast<float>(accum[13]),
                 static_cast<float>(accum[14]), static_cast<float>(accum[15]));
         }
+
+        // CUTLASS_TRACE_DEVICE_TID(" now1 warp_loaded_frag_A_[0:7]=[%f, %f, %f, %f, %f, %f, %f, %f]",
+        //     static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][0]), static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][1]),
+        //     static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][2]), static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][3]),
+        //     static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][4]), static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][5]),
+        //     static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][6]), static_cast<float>(pipe_state.warp_loaded_frag_A_[warp_mma_k % 2][7]));
 #endif
       }
 
@@ -832,6 +864,7 @@ class Wint2xMmaMultistage :
     this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_);
     ++this->warp_tile_iterator_B_;
 
+#if 0
     if (PipeState::WarpLoadedFragmentA::kElements == 8) {
       ElementA* warp_frag_A_ptr = reinterpret_cast<ElementA*>(pipe_state.warp_frag_A_[0].data());
       CUTLASS_TRACE_DEVICE(" warp_frag_A_=[%f, %f, %f, %f, %f, %f, %f, %f], %d bytes",
@@ -841,6 +874,7 @@ class Wint2xMmaMultistage :
           static_cast<float>(warp_frag_A_ptr[6]), static_cast<float>(warp_frag_A_ptr[7]),
           sizeof_bits<typename PipeState::WarpLoadedFragmentA>::value / 8);
     }
+#endif
 #if 0
     if (PipeState::WarpLoadedFragmentB::kElements == 64) {
       uint8_t* reg_uint8_ptr = reinterpret_cast<uint8_t*>(pipe_state.warp_loaded_frag_B_.data());
@@ -881,8 +915,69 @@ class Wint2xMmaMultistage :
                                  pipe_state.warp_frag_B_,
                                  0);
 
+#if 0
+    if (TransformBAfterLDS::result_type::kElements == 64) {
+      CUTLASS_TRACE_DEVICE(" TransformBAfterLDS::result_type::kElements: 64, %d bytes", sizeof_bits<typename TransformBAfterLDS::result_type>::value / 8);
+      CUTLASS_TRACE_DEVICE(" warp_loaded_frag_B_[0:15]=[%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f]",
+          static_cast<float>(unpacked_frag_B[0]), static_cast<float>(unpacked_frag_B[1]),
+          static_cast<float>(unpacked_frag_B[2]), static_cast<float>(unpacked_frag_B[3]),
+          static_cast<float>(unpacked_frag_B[4]), static_cast<float>(unpacked_frag_B[5]),
+          static_cast<float>(unpacked_frag_B[6]), static_cast<float>(unpacked_frag_B[7]),
+          static_cast<float>(unpacked_frag_B[8]), static_cast<float>(unpacked_frag_B[9]),
+          static_cast<float>(unpacked_frag_B[10]), static_cast<float>(unpacked_frag_B[11]),
+          static_cast<float>(unpacked_frag_B[12]), static_cast<float>(unpacked_frag_B[13]),
+          static_cast<float>(unpacked_frag_B[14]), static_cast<float>(unpacked_frag_B[15]));
+      CUTLASS_TRACE_DEVICE(" warp_loaded_frag_B_[16:31]=[%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f]",
+          static_cast<float>(unpacked_frag_B[16]), static_cast<float>(unpacked_frag_B[17]),
+          static_cast<float>(unpacked_frag_B[18]), static_cast<float>(unpacked_frag_B[19]),
+          static_cast<float>(unpacked_frag_B[20]), static_cast<float>(unpacked_frag_B[21]),
+          static_cast<float>(unpacked_frag_B[22]), static_cast<float>(unpacked_frag_B[23]),
+          static_cast<float>(unpacked_frag_B[24]), static_cast<float>(unpacked_frag_B[25]),
+          static_cast<float>(unpacked_frag_B[26]), static_cast<float>(unpacked_frag_B[27]),
+          static_cast<float>(unpacked_frag_B[28]), static_cast<float>(unpacked_frag_B[29]),
+          static_cast<float>(unpacked_frag_B[30]), static_cast<float>(unpacked_frag_B[31]));
+      CUTLASS_TRACE_DEVICE(" warp_loaded_frag_B_[32:47]=[%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f]",
+          static_cast<float>(unpacked_frag_B[32]), static_cast<float>(unpacked_frag_B[33]),
+          static_cast<float>(unpacked_frag_B[34]), static_cast<float>(unpacked_frag_B[35]),
+          static_cast<float>(unpacked_frag_B[36]), static_cast<float>(unpacked_frag_B[37]),
+          static_cast<float>(unpacked_frag_B[38]), static_cast<float>(unpacked_frag_B[39]),
+          static_cast<float>(unpacked_frag_B[40]), static_cast<float>(unpacked_frag_B[41]),
+          static_cast<float>(unpacked_frag_B[42]), static_cast<float>(unpacked_frag_B[43]),
+          static_cast<float>(unpacked_frag_B[44]), static_cast<float>(unpacked_frag_B[45]),
+          static_cast<float>(unpacked_frag_B[46]), static_cast<float>(unpacked_frag_B[47]));
+      CUTLASS_TRACE_DEVICE(" warp_loaded_frag_B_[48:63]=[%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f]",
+          static_cast<float>(unpacked_frag_B[48]), static_cast<float>(unpacked_frag_B[49]),
+          static_cast<float>(unpacked_frag_B[50]), static_cast<float>(unpacked_frag_B[51]),
+          static_cast<float>(unpacked_frag_B[52]), static_cast<float>(unpacked_frag_B[53]),
+          static_cast<float>(unpacked_frag_B[54]), static_cast<float>(unpacked_frag_B[55]),
+          static_cast<float>(unpacked_frag_B[56]), static_cast<float>(unpacked_frag_B[57]),
+          static_cast<float>(unpacked_frag_B[58]), static_cast<float>(unpacked_frag_B[59]),
+          static_cast<float>(unpacked_frag_B[60]), static_cast<float>(unpacked_frag_B[61]),
+          static_cast<float>(unpacked_frag_B[62]), static_cast<float>(unpacked_frag_B[63]));
+    }
+#endif
+
     if (Detail::kStagedAccumulation) {
       pipe_state.tmp_accum_.clear();
+      CUTLASS_TRACE_DEVICE(" before tmp_accum_[0:15]=[%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f]",
+          static_cast<float>(pipe_state.tmp_accum_[0]), static_cast<float>(pipe_state.tmp_accum_[1]),
+          static_cast<float>(pipe_state.tmp_accum_[2]), static_cast<float>(pipe_state.tmp_accum_[3]),
+          static_cast<float>(pipe_state.tmp_accum_[4]), static_cast<float>(pipe_state.tmp_accum_[5]),
+          static_cast<float>(pipe_state.tmp_accum_[6]), static_cast<float>(pipe_state.tmp_accum_[7]),
+          static_cast<float>(pipe_state.tmp_accum_[8]), static_cast<float>(pipe_state.tmp_accum_[9]),
+          static_cast<float>(pipe_state.tmp_accum_[10]), static_cast<float>(pipe_state.tmp_accum_[11]),
+          static_cast<float>(pipe_state.tmp_accum_[12]), static_cast<float>(pipe_state.tmp_accum_[13]),
+          static_cast<float>(pipe_state.tmp_accum_[14]), static_cast<float>(pipe_state.tmp_accum_[15]));
+    } else {
+      CUTLASS_TRACE_DEVICE(" before tmp_accum_[0:15]=[%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f]",
+          static_cast<float>(accum[0]), static_cast<float>(accum[1]),
+          static_cast<float>(accum[2]), static_cast<float>(accum[3]),
+          static_cast<float>(accum[4]), static_cast<float>(accum[5]),
+          static_cast<float>(accum[6]), static_cast<float>(accum[7]),
+          static_cast<float>(accum[8]), static_cast<float>(accum[9]),
+          static_cast<float>(accum[10]), static_cast<float>(accum[11]),
+          static_cast<float>(accum[12]), static_cast<float>(accum[13]),
+          static_cast<float>(accum[14]), static_cast<float>(accum[15]));
     }
 
     int stage = Base::kStages - 1;
 
@@ -187,8 +187,8 @@ class Wint2ParamsAccessor {
       smem_write_stage_idx_(0),
       smem_read_stage_idx_(0)
   {
-    CUTLASS_TRACE_DEVICE(" Shape: {%d, %d, %d}, kSmemRows=%d, kSmemColumns=%d, kLocalScaleRows=%d, kStagesPerLocalScaleLoad=%d",
-        Shape::kM, Shape::kN, Shape::kK, kSmemRows, kSmemColumns, kLocalScaleRows, kStagesPerLocalScaleLoad);
+    //CUTLASS_TRACE_DEVICE(" Shape: {%d, %d, %d}, kSmemRows=%d, kSmemColumns=%d, kLocalScaleRows=%d, kStagesPerLocalScaleLoad=%d",
+    //    Shape::kM, Shape::kN, Shape::kK, kSmemRows, kSmemColumns, kLocalScaleRows, kStagesPerLocalScaleLoad);
     //CUTLASS_TRACE_DEVICE(" IteratorSuperScale::Shape: {%d, %d}, kSuperScaleSmemOffset=%d, smem_ptr=%p",
     //    IteratorSuperScale::Shape::kRow, IteratorSuperScale::Shape::kColumn, kSuperScaleSmemOffset, get_super_scale_smem_ptr());
     //CUTLASS_TRACE_DEVICE(" IteratorLocalScale::Shape: {%d, %d}, kLocalScaleSmemOffset=%d, smem_ptr=%p",
 
@@ -287,7 +287,7 @@ class MmaTensorOpWin2xDequantizer<
                 static_cast<ElementCompute>(shifted_local_scale) * static_cast<ElementCompute>(super_scale_frag[i]);
         }
 
-#if 1
+#if 0
         if (FragmentCompute::kElements == 4) {
         CUTLASS_TRACE_DEVICE(" [stage=%d] tb_offset_k=%d, local_scale_shift=%d, scale_frag[0:3]=[%f, %f, %f, %f], sizeof(FragmentCompute)=%d bytes",
                 stage, tb_offset_k, local_scale_shift,
@@ -312,7 +312,7 @@ class MmaTensorOpWin2xDequantizer<
         }
 
         if (FragmentOutput::kElements == 64) {
-#if 1
+#if 0
             CUTLASS_TRACE_DEVICE(" [stage=%d] output_frag[0:15]=[%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f]",
                 stage,
                 static_cast<float>(output_frag[0]), static_cast<float>(output_frag[1]),
@@ -362,6 +362,16 @@ class MmaTensorOpWin2xDequantizer<
         // avoid numerous conversion instructions in GEMM main loop.
         arch::device_breakpoint();
 #endif
+
+        const int fixed_values[64] = {
+            0, 1, 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, 57,
+            2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59,
+            4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53, 60, 61,
+            6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63
+        };
+        for (int i = 0; i < FragmentUnpack::kElements; ++i) {
+            output_frag[i] = static_cast<typename FragmentUnpack::Element>(fixed_values[(i % 16) + (threadIdx.x % 4) * 16]);
+        }
     }
 
     /// Add an offset to pointer in units of elements.