[Perf] SM100 FP8 GEMM Optimizations after cutlass_profiler (vllm-project#20071)

ilmarkov · ilmarkov · web-flow · commit 8f9d57eeb2bb · 2025-06-26T20:50:09.000-07:00
Signed-off-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@@ -29,40 +29,40 @@ struct sm100_fp8_config_default {
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue>
 struct sm100_fp8_config_M256 {
-  // M in (128, 256]
+  // M in (64, 256]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
   using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
   using TileShape = Shape<_128, _128, _128>;
-  using ClusterShape = Shape<_2, _2, _1>;
+  using ClusterShape = Shape<_2, _1, _1>;
   using Cutlass3xGemm =
       cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
                             KernelSchedule, EpilogueSchedule>;
 };
 
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M128 {
-  // M in (64, 128]
+struct sm100_fp8_config_M64 {
+  // M in (16, 64]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
   using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_128, _128, _256>;
-  using ClusterShape = Shape<_2, _4, _1>;
+  using TileShape = Shape<_64, _64, _128>;
+  using ClusterShape = Shape<_1, _1, _1>;
   using Cutlass3xGemm =
       cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
                             KernelSchedule, EpilogueSchedule>;
 };
 
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M64 {
-  // M in [1, 64]
+struct sm100_fp8_config_M16 {
+  // M in [1, 16]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
   using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_64, _64, _256>;
-  using ClusterShape = Shape<_1, _8, _1>;
+  using TileShape = Shape<_64, _64, _128>;
+  using ClusterShape = Shape<_1, _4, _1>;
   using Cutlass3xGemm =
       cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
                             KernelSchedule, EpilogueSchedule>;
@@ -82,27 +82,27 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
   using Cutlass3xGemmDefault =
       typename sm100_fp8_config_default<InType, OutType,
                                         Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM16 =
+      typename sm100_fp8_config_M16<InType, OutType, Epilogue>::Cutlass3xGemm;
   using Cutlass3xGemmM64 =
       typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM128 =
-      typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
   using Cutlass3xGemmM256 =
       typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm;
 
   uint32_t const m = a.size(0);
   uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
+      std::max(static_cast<uint32_t>(16), next_pow_2(m));  // next power of 2
 
-  if (mp2 <= 64) {
-    // m in [1, 64]
-    return cutlass_gemm_caller<Cutlass3xGemmM64>(
+  if (mp2 <= 16) {
+    // m in [1, 16]
+    return cutlass_gemm_caller<Cutlass3xGemmM16>(
         out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 128) {
-    // m in (64, 128]
-    return cutlass_gemm_caller<Cutlass3xGemmM128>(
+  } else if (mp2 <= 64) {
+    // m in (16, 64]
+    return cutlass_gemm_caller<Cutlass3xGemmM64>(
         out, a, b, std::forward<EpilogueArgs>(args)...);
   } else if (mp2 <= 256) {
-    // m in (128, 256]
+    // m in (64, 256]
     return cutlass_gemm_caller<Cutlass3xGemmM256>(
         out, a, b, std::forward<EpilogueArgs>(args)...);
   } else {