
Commit 6bdbc78

jwfromm authored and facebook-github-bot committed
New DeepGemm Style Groupwise Kernel (#4365)
Summary:
X-link: facebookresearch/FBGEMM#1433
Pull Request resolved: #4365

Initial enablement of CUTLASS' new groupwise scaling API for FP8 GEMM. This diff adds all the needed scaffolding, and we confirm that the kernel runs and produces correct outputs, but I do not yet include the tuning that would yield better performance.

Interestingly, CUTLASS wants group/block scales in MN-major format, while every other groupwise implementation I've seen uses K-major. I add an option to our triton blockwise quantization kernels to support this layout. In benchmarking those quantization kernels, I see that triton blockwise quantization in general (with or without K-major output) is quite slow. We may need to iterate on that if this becomes a commonly used kernel.

One other interesting consideration is that we may actually see performance benefits from using smaller tiles along N. Right now, we are forced to use tiles that are at least 128 along N. Going down to scale blocks of size [64, 128] would let us use tiles of size 64, opening up more kernel configuration options.

Reviewed By: jiawenliu64

Differential Revision: D76830629

fbshipit-source-id: cbaf199f54b0b627ff63eba6b8af90d94d448863
1 parent 474f66c
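For readers unfamiliar with the two layouts discussed above, here is a minimal sketch of how K-major block scales relate to the MN-major layout that CUTLASS expects. The sizes are illustrative only and the snippet is not part of this diff; it simply mirrors the transpose that the PyTorch fallback path in the diff below performs.

import torch

# Illustrative sizes: a [512, 1024] activation quantized with [128, 128] scale blocks.
M, K, BLOCK = 512, 1024, 128

# K-major scales: one entry per block, indexed as [m_block, k_block].
scales_k_major = torch.rand(M // BLOCK, K // BLOCK)  # shape (4, 8)

# MN-major scales: the same values with the block axes swapped, indexed as [k_block, m_block].
# This mirrors the `x_scale.t().contiguous()` fallback in the PyTorch path of the diff below.
scales_mn_major = scales_k_major.t().contiguous()  # shape (8, 4)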

File tree

9 files changed, +564 -7 lines changed

fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py

Lines changed: 25 additions & 7 deletions
@@ -3018,6 +3018,7 @@ def _kernel_quantize_fp8_block(
     CLAMP_MAX: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_K: tl.constexpr,
+    K_MAJOR: tl.constexpr,
 ) -> None:
     """Quantize and scale each [BLOCK_M, BLOCK_K] block.
 
@@ -3047,6 +3048,7 @@ def _kernel_quantize_fp8_block(
         CLAMP_MAX (bool): Whether to apply scale_ub.
         BLOCK_M (int): Block size for M dimension of A_scale and kernel.
         BLOCK_K (int): Block size for K dimension of A_scale and kernel.
+        K_MAJOR (bool): Whether output scales should be K major (True) or MN major (False).
     """
     pid = tl.program_id(0)
     grid_k = tl.cdiv(K, BLOCK_K)
@@ -3068,9 +3070,12 @@ def _kernel_quantize_fp8_block(
         block_max = tl.maximum(block_max, EPS)
     scale = MAX_FP8 / block_max
 
-    tl.store(
-        A_scale + block_m * stride_a_scale_m + block_k * stride_a_scale_k, 1.0 / scale
-    )
+    # Write in transposed order if specified.
+    if K_MAJOR:
+        scale_offset = block_m * stride_a_scale_m + block_k * stride_a_scale_k
+    else:
+        scale_offset = block_k * stride_a_scale_m + block_m * stride_a_scale_k
+    tl.store(A_scale + scale_offset, 1.0 / scale)
     a_fp8 = a_block * scale
     # Clamp A to fp8 range to make sure there's no overflow.
     # This is required for AMD. Nvidia's default saturation
@@ -3085,6 +3090,7 @@ def triton_quantize_fp8_block(
     block_m: int = 256,
     block_k: int = 256,
     scale_ub: Optional[torch.Tensor] = None,
+    K_major: bool = True,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize a tensor to fp8 with block-wise scalings.
@@ -3096,10 +3102,12 @@ def triton_quantize_fp8_block(
         block_m (int): Block size for M dimension of scale.
         block_k (int): Block size for K dimension of scale.
         scale_ub: Maximum allowed value for scale.
+        K_major (bool): Whether output scales should be K major (True) or MN major (False).
 
     Returns:
         torch.Tensor : [M, K] fp8 scaled tensor.
-        torch.Tensor: [cdiv(M, block_m), cdiv(K, block_k)] reciprocal scale tensor per block.
+        torch.Tensor: [cdiv(M, block_m), cdiv(K, block_k)] reciprocal scale tensor per block
+            if K_major is True, otherwise [cdiv(K, block_k), cdiv(M, block_M)].
     """
     assert x.device != torch.device(
         "cpu"
@@ -3111,7 +3119,10 @@ def triton_quantize_fp8_block(
     M, K = x.shape
     grid_m = triton.cdiv(M, block_m)
     grid_k = triton.cdiv(K, block_k)
-    x_scale = torch.empty((grid_m, grid_k), device=x.device, dtype=torch.float32)
+    if K_major:
+        x_scale = torch.empty((grid_m, grid_k), device=x.device, dtype=torch.float32)
+    else:
+        x_scale = torch.ones((grid_k, grid_m), device=x.device, dtype=torch.float32)
     x_fp8 = torch.empty((M, K), device=x.device, dtype=pt_dtype)
 
     _kernel_quantize_fp8_block[(grid_m * grid_k,)](
@@ -3139,6 +3150,8 @@ def triton_quantize_fp8_block(
         BLOCK_M=block_m,
         # pyre-ignore[6]: Incompatible parameter type [6]
         BLOCK_K=block_k,
+        # pyre-ignore[6]: Incompatible parameter type [6]
+        K_MAJOR=K_major,
     )
 
     return x_fp8.view(x_shape), x_scale
@@ -3151,6 +3164,7 @@ def quantize_fp8_block(
     scale_ub: Optional[torch.Tensor] = None,
     use_triton: bool = True,
     output_device: Optional[torch.device] = None,
+    K_major: bool = True,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize a tensor to fp8 with block-wise scalings and optionally move to output device.
@@ -3164,18 +3178,20 @@ def quantize_fp8_block(
         scale_ub: Maximum allowed value for scale.
         use_triton (bool): Whether to use triton kernel or pytorch.
         output_device (torch.device): Device to optionally move the scaled tensors to.
+        K_major (bool): Whether output scales should be K major (True) or MN major (False).
 
     Returns:
         torch.Tensor: [M, K] fp8 scaled tensor.
-        torch.Tensor: [cdiv(M, block_m), cdiv(K, block_k)] reciprocal scale tensor per block.
+        torch.Tensor: [cdiv(M, block_m), cdiv(K, block_k)] reciprocal scale tensor per block
+            if K_major is True, otherwise [cdiv(K, block_k), cdiv(M, block_M)].
     """
     x_shape = x.shape
     x = x.view(-1, x.size(-1))
     if x.device == torch.device("cpu"):
         logger.info("Triton does not support cpu, falling back to torch ops.")
         use_triton = False
     if use_triton:
-        xq, x_scale = triton_quantize_fp8_block(x, block_m, block_k, scale_ub)
+        xq, x_scale = triton_quantize_fp8_block(x, block_m, block_k, scale_ub, K_major)
         return xq.view(x_shape), x_scale
     # else use pytorch implementation.
     if not output_device:
@@ -3219,6 +3235,8 @@ def quantize_fp8_block(
     x_fp8 = x_fp8.to(device=output_device, dtype=pt_dtype)
     x_scale = x_scale.to(output_device)  # pyre-ignore
     del x, x_padded
+    if not K_major:
+        x_scale = x_scale.t().contiguous()
     return x_fp8.view(x_shape), 1 / x_scale  # pyre-ignore
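To illustrate the new flag, a short usage sketch (illustrative tensor sizes, not part of the diff; the expected scale shapes follow the docstrings above):

import torch
from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import quantize_fp8_block

x = torch.randn(512, 1024, device="cuda")

# Default K-major scales: [cdiv(M, block_m), cdiv(K, block_k)] = [4, 8].
xq, x_scale = quantize_fp8_block(x, block_m=128, block_k=128)
assert x_scale.shape == (4, 8)

# MN-major scales for the CUTLASS groupwise kernel: [cdiv(K, block_k), cdiv(M, block_m)] = [8, 4].
xq, x_scale = quantize_fp8_block(x, block_m=128, block_k=128, K_major=False)
assert x_scale.shape == (8, 4)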

fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 42 additions & 0 deletions
@@ -1490,6 +1490,48 @@ def cuda(self) -> bool:
         return True
 
 
+@register_quantize_op
+class FP8CutlassGroupwiseGemm(QuantizeOpBase):
+    """
+    FP8 matmul with group / block scaling.
+    """
+
+    def preprocess(self, x, w):
+        # Quantize weights.
+        # Scale is expected to be in [K, N] layout (N Major).
+        wq, w_scale = quantize_fp8_block(w, block_m=128, block_k=128, K_major=False)
+        # Return processed tensors.
+        return x, wq, w_scale
+
+    def quantize(self, x, wq, w_scale):
+        # Scale is expected to be in [K, M] layout (M Major).
+        xq, x_scale = quantize_fp8_block(x, block_m=1, block_k=128, K_major=False)
+        # Pretranspose scales to deepgemm format.
+        return xq, wq, x_scale, w_scale
+
+    def compute(self, xq, wq, x_scale, w_scale):
+        return torch.ops.fbgemm.f8f8bf16_groupwise(xq, wq, x_scale, w_scale)
+
+    def quantize_and_compute(self, x, wq, w_scale):
+        xq, wq, x_scale, w_scale = self.quantize(x, wq, w_scale)
+        return self.compute(xq, wq, x_scale, w_scale)
+
+    @property
+    def name(self) -> str:
+        if torch.version.cuda:
+            return "cutlass_groupwise"
+        else:
+            return "ck_groupwise"
+
+    @property
+    def hip(self) -> bool:
+        return False
+
+    @property
+    def cuda(self) -> bool:
+        return True
+
+
 ####################################################################################################
 # CUTLASS kernel v2
 ####################################################################################################
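For reference, a hedged end-to-end sketch of what this benchmark op does, assuming a CUDA build of fbgemm_gpu that registers the new torch.ops.fbgemm.f8f8bf16_groupwise op (tensor sizes are illustrative, not from the diff):

import torch
from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import quantize_fp8_block

M, N, K = 1024, 2048, 4096
x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
w = torch.randn(N, K, device="cuda", dtype=torch.bfloat16)

# Weights: [128, 128] scale blocks, MN major, so w_scale has shape [K // 128, N // 128].
wq, w_scale = quantize_fp8_block(w, block_m=128, block_k=128, K_major=False)
# Activations: groups of 128 along K per row, MN major, so x_scale has shape [K // 128, M].
xq, x_scale = quantize_fp8_block(x, block_m=1, block_k=128, K_major=False)

# FP8 x FP8 -> BF16 groupwise GEMM; output has shape [M, N].
y = torch.ops.fbgemm.f8f8bf16_groupwise(xq, wq, x_scale, w_scale)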
Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+// clang-format on
+
+#include "f8f8bf16_groupwise/f8f8bf16_groupwise_manifest.cuh"
+#include "fbgemm_gpu/quantize/tuning_cache.hpp"
+#include "fbgemm_gpu/quantize/utils.h"
+
+namespace fbgemm_gpu {
+
+#if CUDART_VERSION >= 12000
+
+// FP8 Groupwise Cutlass kernel dispatch.
+Kernel_f8f8bf16_groupwise
+get_kernel_via_heuristic(int arch, int M, int N, int K) {
+  // Use shape heuristics to dispatch to optimized kernel configuration.
+  // Initial enablement includes only one schedule.
+  if (M <= 16) {
+    return f8f8bf16_groupwise_128_16_128_1_1_1_9_t;
+  } else {
+    return f8f8bf16_groupwise_128_128_128_1_2_1_9_f;
+  }
+}
+
+Kernel_f8f8bf16_groupwise get_kernel_via_tuning(
+    int arch,
+    int M,
+    int N,
+    int K,
+    at::Tensor XQ,
+    at::Tensor WQ,
+    at::Tensor x_scale,
+    at::Tensor w_scale) {
+  // One cache per kernel type
+  static TuningCache cache("f8f8bf16_groupwise");
+
+  // Reducing amount of auto tuning by rounding up M to next power of 2.
+  M = nextPowerOf2(M);
+  // Use (M, N, K) shape as the key.
+  const std::string shape_key =
+      std::to_string(M) + "_" + std::to_string(N) + "_" + std::to_string(K);
+  const auto& kernels = get_f8f8bf16_groupwise_kernels(arch);
+  auto kernel = cache.findBestKernelMaybeAutotune(
+      shape_key, kernels, XQ, WQ, x_scale, w_scale);
+  return kernel;
+}
+
+// FP8 Rowwise Cutlass kernel dispatch.
+at::Tensor dispatch_fp8_groupwise_kernel(
+    at::Tensor XQ,
+    at::Tensor WQ,
+    at::Tensor x_scale,
+    at::Tensor w_scale) {
+  int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
+  int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
+  int K = XQ.size(-1);
+
+  static int arch = -1;
+  // Avoid expensive cudaGetDeviceProperties call.
+  if (arch < 0) {
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, 0);
+    if (prop.major >= 10) {
+      arch = 10;
+      int runtimeVersion;
+      C10_CUDA_CHECK(cudaRuntimeGetVersion(&runtimeVersion));
+      TORCH_CHECK(
+          runtimeVersion >= 12080,
+          "FP8 GEMM on sm100a or above requires cuda >= 12.8");
+    } else {
+      arch = 9;
+    }
+  }
+
+  // Select kernel to run via heuristics or tuning.
+  auto kernel = [&]() {
+    if (std::getenv("FBGEMM_AUTOTUNE_ENABLE")) {
+      return get_kernel_via_tuning(arch, M, N, K, XQ, WQ, x_scale, w_scale);
+    } else {
+      return get_kernel_via_heuristic(arch, M, N, K);
+    }
+  }();
+  // Invoke kernel
+  return kernel(XQ, WQ, x_scale, w_scale);
+}
+
+at::Tensor f8f8bf16_groupwise(
+    at::Tensor XQ, // FP8
+    at::Tensor WQ, // FP8
+    at::Tensor x_scale,
+    at::Tensor w_scale) {
+  // Invoke and return rowwise kernel without output argument.
+  return dispatch_fp8_groupwise_kernel(XQ, WQ, x_scale, w_scale);
+}
+
+#else
+
+at::Tensor f8f8bf16_groupwise(
+    at::Tensor XQ, // FP8
+    at::Tensor WQ, // FP8
+    at::Tensor x_scale,
+    at::Tensor w_scale) {
+  throw std::runtime_error(
+      "CUDA version is older than 12.0"); // requires CUDA>=12
+}
+#endif
+
+} // namespace fbgemm_gpu
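The dispatch logic above consults the FBGEMM_AUTOTUNE_ENABLE environment variable on every call, so autotuning can be switched on from Python before invoking the op. A sketch under that assumption (illustrative shapes; quantization as in the benchmark op above):

import os
import torch
from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import quantize_fp8_block

# Switch from the static shape heuristic to the shape-keyed tuning cache.
os.environ["FBGEMM_AUTOTUNE_ENABLE"] = "1"

M, N, K = 64, 2048, 4096
xq, x_scale = quantize_fp8_block(
    torch.randn(M, K, device="cuda", dtype=torch.bfloat16),
    block_m=1, block_k=128, K_major=False,
)
wq, w_scale = quantize_fp8_block(
    torch.randn(N, K, device="cuda", dtype=torch.bfloat16),
    block_m=128, block_k=128, K_major=False,
)

# The first call for a given (next_power_of_2(M), N, K) key benchmarks the
# registered groupwise kernels; later calls with the same key reuse the winner.
y = torch.ops.fbgemm.f8f8bf16_groupwise(xq, wq, x_scale, w_scale)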
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "f8f8bf16_groupwise_common.cuh"
+
+namespace fbgemm_gpu {
+
+at::Tensor f8f8bf16_groupwise_128_128_128_1_2_1_9_f(
+    at::Tensor XQ,
+    at::Tensor WQ,
+    at::Tensor x_scale,
+    at::Tensor w_scale) {
+  // Dispatch this kernel to the correct underlying implementation.
+  return f8f8bf16_groupwise_wrapper<128, 128, 128, 1, 2, 1, 9, false>(
+      XQ, WQ, x_scale, w_scale);
+}
+
+} // namespace fbgemm_gpu
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "f8f8bf16_groupwise_common.cuh"
+
+namespace fbgemm_gpu {
+
+at::Tensor f8f8bf16_groupwise_128_16_128_1_1_1_9_t(
+    at::Tensor XQ,
+    at::Tensor WQ,
+    at::Tensor x_scale,
+    at::Tensor w_scale) {
+  // Dispatch this kernel to the correct underlying implementation.
+  return f8f8bf16_groupwise_wrapper<128, 16, 128, 1, 1, 1, 9, true>(
+      XQ, WQ, x_scale, w_scale);
+}
+
+} // namespace fbgemm_gpu
