
Commit 76c16e5

jwfromm authored and facebook-github-bot committed
Make more kernels support 3D inputs. (#4476)
Summary:

X-link: facebookresearch/FBGEMM#1533

Pull Request resolved: #4476

Some of our kernels still require X to be 2D, but there are occasionally cases where we'd like it to include a batch dimension. This diff patches up support for such cases across a few kernels.

Reviewed By: jerryzh168, jiawenliu64

Differential Revision: D78171092

fbshipit-source-id: 9d3cdd71da2b0b0b9ff3fa6e67700dc331c82c34
1 parent bb85def commit 76c16e5
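The recurring pattern in this diff is to fold any leading batch dimensions of XQ into the GEMM's M and to build the output shape by copying XQ's sizes and replacing the last dimension with N. Below is a minimal standalone C++ sketch of that arithmetic, assuming size_to_dim_(k, sizes) returns the product of sizes[0..k); product_up_to is a stand-in name used for illustration only.

#include <cstddef>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Stand-in for what size_to_dim_(k, sizes) is assumed to compute:
// the product of sizes[0..k).
int64_t product_up_to(std::size_t k, const std::vector<int64_t>& sizes) {
  return std::accumulate(
      sizes.begin(), sizes.begin() + k, int64_t{1}, std::multiplies<int64_t>());
}

int main() {
  std::vector<int64_t> xq_sizes = {4, 128, 256}; // XQ: {b, M, K}
  std::vector<int64_t> wq_sizes = {512, 256};    // WQ: {N, K}

  // Leading dims of XQ collapse into the GEMM's M; K is XQ's last dim.
  int64_t M = product_up_to(xq_sizes.size() - 1, xq_sizes); // 4 * 128 = 512
  int64_t K = xq_sizes.back();                               // 256
  int64_t N = product_up_to(wq_sizes.size() - 1, wq_sizes);  // 512

  // The output keeps XQ's shape with the last dim replaced by N:
  // {4, 128, 256} -> {4, 128, 512}. A 2D XQ {M, K} yields {M, N} the same way.
  std::vector<int64_t> out_sizes = xq_sizes;
  out_sizes.back() = N;

  (void)M; (void)K; (void)out_sizes; // values are shown in the comments above
  return 0;
}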


7 files changed (+103, -38 lines)


fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_blockwise_gemm.hip

Lines changed: 8 additions & 4 deletions
@@ -59,12 +59,16 @@ at::Tensor f8f8bf16_blockwise_impl(
   TORCH_CHECK(WQ.is_cuda() && WQ.is_contiguous());

   // Get input information.
-  int M = XQ.size(0);
-  int N = WQ.size(0);
-  int K = XQ.size(1);
+  int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
+  int K = XQ.size(-1);
+  int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
+  // 1. If the input tensor is {M, K}, the output tensor is {M, N}.
+  // 2. If the input tensor is {b, M, K}, the output tensor is {b, M, N}.
+  auto out_sizes = XQ.sizes().vec();
+  out_sizes.back() = N;

   // Create output tensor.
-  auto Y = at::empty({M, N}, XQ.options().dtype(at::kBFloat16));
+  auto Y = at::empty(out_sizes, XQ.options().dtype(at::kBFloat16));
   // If inputs are empty return an empty tensor.
   if (M == 0 || N == 0 || K == 0) {
     return Y;
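Why folding the batch into M is safe here: the impl already requires XQ.is_contiguous(), and a contiguous {b, M, K} tensor has exactly the same memory layout as a {b*M, K} row-major matrix, so the underlying 2D GEMM can run unchanged. A small standalone sketch of that offset equivalence, with illustrative shapes:

#include <cassert>
#include <cstdint>

int main() {
  // Illustrative shapes for a contiguous {b, M, K} tensor.
  const int64_t b = 4, M = 128, K = 256;

  for (int64_t i = 0; i < b; ++i) {
    for (int64_t m = 0; m < M; ++m) {
      for (int64_t k = 0; k < K; ++k) {
        // Row-major strides of the 3D tensor are {M * K, K, 1}.
        int64_t offset_3d = i * (M * K) + m * K + k;
        // Folding (i, m) into a single row index gives a {b * M, K} matrix
        // with strides {K, 1}; the same element lands at the same offset.
        int64_t row = i * M + m;
        int64_t offset_2d = row * K + k;
        assert(offset_3d == offset_2d);
      }
    }
  }
  return 0;
}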

fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_tensorwise_gemm.hip

Lines changed: 16 additions & 7 deletions
@@ -62,18 +62,27 @@ template <
 at::Tensor
 f8f8bf16_tensorwise_impl(at::Tensor XQ, at::Tensor WQ, double scale) {
   // Get input information.
-  int M = XQ.size(0);
-  int N = WQ.size(0);
-  int K = XQ.size(1);
+  int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
+  int K = XQ.size(-1);
+  int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
+  // 1. If the input tensor is {M, K}, the output tensor is {M, N}.
+  // 2. If the input tensor is {b, M, K}, the output tensor is {b, M, N}.
+  auto out_sizes = XQ.sizes().vec();
+  out_sizes.back() = N;

   int StrideA = K;
   int StrideB = K;
   int StrideC = N;

+  // Handle case where inputs are empty.
+  if (M == 0 || N == 0 || K == 0) {
+    return at::zeros(out_sizes, XQ.options().dtype(at::kBFloat16));
+  }
+
   TORCH_CHECK(XQ.is_cuda() && XQ.is_contiguous());
   TORCH_CHECK(WQ.is_cuda() && WQ.is_contiguous());

-  auto Y = at::empty({M, N}, XQ.options().dtype(at::kBFloat16));
+  auto Y = at::empty(out_sizes, XQ.options().dtype(at::kBFloat16));

   using ADataType = ck::f8_t;
   using BDataType = ck::f8_t;
@@ -185,9 +194,9 @@ f8f8bf16_tensorwise_impl(at::Tensor XQ, at::Tensor WQ, double scale) {
 enum class KernelMode { Small, Large, Default };

 std::tuple<KernelMode, bool> get_kernel_mode(at::Tensor XQ, at::Tensor WQ) {
-  auto M = XQ.size(0);
-  auto K = XQ.size(1);
-  auto N = WQ.size(0);
+  int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
+  int K = XQ.size(-1);
+  int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
   // Use small kernel when input matrices are small.
   bool use_small_kernel = (M <= 512 && N <= 512) || (M <= 128) || (N <= 128);
   // For larger workloads, specialize to large gemm.
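One consequence of the change above: get_kernel_mode now sees the flattened M (batch times rows per batch), so a batched input is dispatched by its total row count rather than its per-slice size. A standalone sketch of the use_small_kernel condition from the hunk above under that flattened M (the large-kernel branch is not reproduced here):

#include <cstdint>
#include <cstdio>

// Mirrors the use_small_kernel condition shown in the diff above.
bool use_small_kernel(int64_t M, int64_t N) {
  return (M <= 512 && N <= 512) || (M <= 128) || (N <= 128);
}

int main() {
  const int64_t b = 8, rows = 64, N = 4096;
  // Per-slice rows = 64 would qualify as "small" (rows <= 128), but the
  // kernel sees the flattened M = b * rows = 512, and with N = 4096 the
  // small-kernel condition no longer holds.
  const int64_t M = b * rows;
  std::printf("flattened M = %lld, small kernel = %d\n",
              static_cast<long long>(M), use_small_kernel(M, N));
  return 0;
}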

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16.cu

Lines changed: 2 additions & 2 deletions
@@ -40,8 +40,8 @@ at::Tensor f8f8bf16_impl(
   // WQ: N x K
   // output: M x N
   int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
-  int N = WQ.size(0);
-  int K = WQ.size(1);
+  int K = XQ.size(-1);
+  int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
   // 1. If the input tensor is {M, K}, the output tensor is {M, N}.
   // 2. If the input tensor is {b, M, K}, the output tensor is {b, M, N}.
   auto out_sizes = XQ.sizes().vec();

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_tensorwise.cu

Lines changed: 2 additions & 2 deletions
@@ -42,8 +42,8 @@ at::Tensor f8f8bf16_tensorwise_impl(
   // WQ: N x K
   // output: M x N
   int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
-  int N = WQ.size(0);
-  int K = WQ.size(1);
+  int K = XQ.size(-1);
+  int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
   // 1. If the input tensor is {M, K}, the output tensor is {M, N}.
   // 2. If the input tensor is {b, M, K}, the output tensor is {b, M, N}.
   auto out_sizes = XQ.sizes().vec();

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/include/kernel_mode.h

Lines changed: 3 additions & 3 deletions
@@ -15,9 +15,9 @@ namespace fbgemm_gpu {
 enum class KernelMode { Small, Medium, Large, Default };

 inline KernelMode get_kernel_mode(at::Tensor XQ, at::Tensor WQ) {
-  auto M = XQ.size(0);
-  auto K = XQ.size(1);
-  auto N = WQ.size(0);
+  int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
+  int K = XQ.size(-1);
+  int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
   // Use a large kernel if at least two shapes are large....
   bool use_large_kernel =
       ((M >= 2048 && K >= 2048) || (M >= 2048 && N >= 2048) ||

fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp

Lines changed: 71 additions & 19 deletions
@@ -489,9 +489,22 @@ at::Tensor f8f8bf16_blockwise_meta(
     int64_t /* block_m = 128*/,
     int64_t /* block_n = 128*/,
     int64_t /* block_k = 128*/) {
-  const at::SymInt M = XQ.sym_size(0);
-  const at::SymInt N = WQ.sym_size(0);
-  auto Y = at::empty_symint({M, N}, XQ.options().dtype(at::kBFloat16));
+  int64_t x_dims = XQ.dim();
+  int64_t w_dims = WQ.dim();
+  TORCH_CHECK(
+      (x_dims == 2 || x_dims == 3) && (w_dims == 2),
+      "The dim of XQ must be 2 or 3, and dim of WQ must be 2");
+  at::Tensor Y;
+  if (x_dims == 2) {
+    const at::SymInt M = XQ.sym_size(0);
+    const at::SymInt N = WQ.sym_size(0);
+    Y = at::empty_symint({M, N}, XQ.options().dtype(at::kBFloat16));
+  } else {
+    const at::SymInt B = XQ.sym_size(0);
+    const at::SymInt M = XQ.sym_size(1);
+    const at::SymInt N = WQ.sym_size(0);
+    Y = at::empty_symint({B, M, N}, XQ.options().dtype(at::kBFloat16));
+  }
   return Y;
 }

@@ -575,13 +588,26 @@ at::Tensor fp8fp8bf16_fast_gemv_meta(
 }

 at::Tensor f8f8bf16_tensorwise_meta(
-    at::Tensor X,
-    at::Tensor W,
-    double scale,
-    bool use_fast_accum = true) {
-  const at::SymInt M = X.sym_size(0);
-  const at::SymInt N = W.sym_size(0);
-  auto Y = at::empty_symint({M, N}, X.options().dtype(at::kBFloat16));
+    at::Tensor XQ,
+    at::Tensor WQ,
+    double /* scale */,
+    bool /* use_fast_accum = true */) {
+  int64_t x_dims = XQ.dim();
+  int64_t w_dims = WQ.dim();
+  TORCH_CHECK(
+      (x_dims == 2 || x_dims == 3) && (w_dims == 2),
+      "The dim of XQ must be 2 or 3, and dim of WQ must be 2");
+  at::Tensor Y;
+  if (x_dims == 2) {
+    const at::SymInt M = XQ.sym_size(0);
+    const at::SymInt N = WQ.sym_size(0);
+    Y = at::empty_symint({M, N}, XQ.options().dtype(at::kBFloat16));
+  } else {
+    const at::SymInt B = XQ.sym_size(0);
+    const at::SymInt M = XQ.sym_size(1);
+    const at::SymInt N = WQ.sym_size(0);
+    Y = at::empty_symint({B, M, N}, XQ.options().dtype(at::kBFloat16));
+  }
   return Y;
 }

@@ -595,12 +621,25 @@ at::Tensor f8f8bf16_lite_meta(at::Tensor X, at::Tensor W, at::Tensor scale) {
 at::Tensor f8i4bf16_rowwise_meta(
     at::Tensor XQ, // FP8
     at::Tensor WQ, // INT4
-    at::Tensor x_scale,
-    at::Tensor w_scale,
-    at::Tensor w_zp) {
-  const at::SymInt M = XQ.sym_size(0);
-  const at::SymInt N = WQ.sym_size(0);
-  auto Y = at::empty_symint({M, N}, XQ.options().dtype(at::kBFloat16));
+    at::Tensor /* x_scale */,
+    at::Tensor /* w_scale */,
+    at::Tensor /* w_zp */) {
+  int64_t x_dims = XQ.dim();
+  int64_t w_dims = WQ.dim();
+  TORCH_CHECK(
+      (x_dims == 2 || x_dims == 3) && (w_dims == 2),
+      "The dim of X must be 2 or 3, and dim of W must be 2");
+  at::Tensor Y;
+  if (x_dims == 2) {
+    const at::SymInt M = XQ.sym_size(0);
+    const at::SymInt N = WQ.sym_size(0);
+    Y = at::empty_symint({M, N}, XQ.options().dtype(at::kBFloat16));
+  } else {
+    const at::SymInt B = XQ.sym_size(0);
+    const at::SymInt M = XQ.sym_size(1);
+    const at::SymInt N = WQ.sym_size(0);
+    Y = at::empty_symint({B, M, N}, XQ.options().dtype(at::kBFloat16));
+  }
   return Y;
 }

@@ -632,9 +671,22 @@ at::Tensor bf16i4bf16_rowwise_meta(
     at::Tensor /* w_scale_group */,
     at::Tensor /* w_zero_group */
 ) {
-  const at::SymInt M = X.sym_size(0);
-  const at::SymInt N = W.sym_size(0);
-  auto Y = at::empty_symint({M, N}, X.options().dtype(at::kBFloat16));
+  int64_t x_dims = X.dim();
+  int64_t w_dims = W.dim();
+  TORCH_CHECK(
+      (x_dims == 2 || x_dims == 3) && (w_dims == 2),
+      "The dim of XQ must be 2 or 3, and dim of WQ must be 2");
+  at::Tensor Y;
+  if (x_dims == 2) {
+    const at::SymInt M = X.sym_size(0);
+    const at::SymInt N = W.sym_size(0);
+    Y = at::empty_symint({M, N}, X.options().dtype(at::kBFloat16));
+  } else {
+    const at::SymInt B = X.sym_size(0);
+    const at::SymInt M = X.sym_size(1);
+    const at::SymInt N = W.sym_size(0);
+    Y = at::empty_symint({B, M, N}, X.options().dtype(at::kBFloat16));
+  }
   return Y;
 }
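The meta kernels above mirror the same shape logic so that shape inference (for example under meta-device tracing) yields {B, M, N} for a 3D XQ and {M, N} for a 2D one, and rejects other ranks up front. A compact standalone sketch of that contract, using an illustrative helper name and plain int64_t sizes in place of SymInt:

#include <cstdint>
#include <stdexcept>
#include <vector>

// Illustrative helper: given XQ's and WQ's sizes, return the output sizes
// under the "XQ is 2D or 3D, WQ is 2D" contract enforced above.
std::vector<int64_t> infer_out_sizes(
    const std::vector<int64_t>& xq_sizes,
    const std::vector<int64_t>& wq_sizes) {
  if ((xq_sizes.size() != 2 && xq_sizes.size() != 3) || wq_sizes.size() != 2) {
    throw std::invalid_argument(
        "The dim of XQ must be 2 or 3, and dim of WQ must be 2");
  }
  const int64_t N = wq_sizes[0]; // WQ is {N, K}
  if (xq_sizes.size() == 2) {
    return {xq_sizes[0], N}; // {M, N}
  }
  return {xq_sizes[0], xq_sizes[1], N}; // {B, M, N}
}

int main() {
  auto y2d = infer_out_sizes({128, 256}, {512, 256});    // -> {128, 512}
  auto y3d = infer_out_sizes({4, 128, 256}, {512, 256}); // -> {4, 128, 512}
  (void)y2d; (void)y3d;
  return 0;
}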

fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py

Lines changed: 1 addition & 1 deletion
@@ -300,7 +300,7 @@ def test_quantize_fp8_matmul(
         if torch.version.hip:
             UseFastAccum = True
         # Setup input shapes.
-        if InputMultiDim and not torch.version.hip:
+        if InputMultiDim:
             x = (
                 torch.randn(
                     size=(3, B_T, D),
