
Commit e835e29

jiawenliu64 authored and facebook-github-bot committed
Minor improvement of FP4 GEMM for Llama4 shapes (#4459)
Summary:
Pull Request resolved: #4459
X-link: facebookresearch/FBGEMM#1519

This adds minor performance improvements to FP4 GEMM for some Llama4 shapes; the previous general heuristics already provide the best tiling/cluster configuration for most Llama4 shapes.

Reviewed By: cthi

Differential Revision: D77984623

fbshipit-source-id: 3d057dab8968270584d45f14e9c10581999efadf
1 parent b4137fc, commit e835e29

22 files changed: +913 -1 lines

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f4f4bf16.cu

Lines changed: 113 additions & 1 deletion
@@ -27,8 +27,8 @@ at::Tensor dispatch_f4f4bf16_kernel(
     std::optional<at::Tensor> global_scale,
     bool use_mx = true) {
   auto M = XQ.size(0);
-  auto K = XQ.size(1);
   auto N = WQ.size(0);
+  auto K = XQ.size(1) * 2; // Since K is packed
   auto BLOCK_SIZE = 16;
   TORCH_CHECK(
       N % BLOCK_SIZE == 0 && K % BLOCK_SIZE == 0,
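A note on the K change in this hunk: FP4 (e2m1) values are stored two per byte, so the quantized activation XQ carries only K / 2 elements along its second dimension, and the logical K must be recovered as XQ.size(1) * 2 before the divisibility check. A minimal sketch of that packed layout, assuming a plain byte tensor stands in for real FP4 data (the helper name is hypothetical, not part of FBGEMM):

#include <ATen/ATen.h>
#include <c10/util/Exception.h>

// Hypothetical helper, for illustration only: allocates a byte tensor shaped
// the way the dispatcher expects packed FP4 activations, i.e. two e2m1 values
// per byte, so a logical [M, K] matrix is stored as [M, K / 2] bytes.
at::Tensor make_packed_fp4_placeholder(int64_t M, int64_t K) {
  TORCH_CHECK(K % 2 == 0, "K must be even to pack two FP4 values per byte");
  auto XQ = at::empty(
      {M, K / 2}, at::TensorOptions().dtype(at::kByte).device(at::kCUDA));
  // The dispatcher recovers the logical K from this packed layout:
  //   auto K = XQ.size(1) * 2;
  return XQ;
}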
@@ -45,6 +45,62 @@ at::Tensor dispatch_f4f4bf16_kernel(
       return f4f4bf16_128_128_4_1_1_t(XQ, WQ, x_scale, w_scale, global_scale);
     }
   } else if (M <= 2048) {
+    if (M <= 256) {
+      if (N == 896) {
+        return f4f4bf16_128_128_2_2_1_t(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 5120) {
+        if (K == 640 || K == 5120) {
+          return f4f4bf16_128_128_4_1_1_t(
+              XQ, WQ, x_scale, w_scale, global_scale);
+        } else if ((K == 8192) || (K == 16384)) {
+          return f4f4bf16_256_128_2_2_1_t(
+              XQ, WQ, x_scale, w_scale, global_scale);
+        }
+      } else if (N == 5632) {
+        return f4f4bf16_128_192_2_2_1_t(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 8192) {
+        return f4f4bf16_256_128_2_2_1_t(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 512) {
+      if (N == 896) {
+        return f4f4bf16_128_128_2_2_1_t(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 5120) {
+        return f4f4bf16_256_192_4_1_1_t(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 5632) {
+        return f4f4bf16_256_128_2_4_1_t(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 8192) {
+        return f4f4bf16_256_128_2_2_1_t(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 1024) {
+      if (N == 896) {
+        return f4f4bf16_256_128_2_4_1_t(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 5120) {
+        if (K == 640) {
+          return f4f4bf16_128_128_1_4_1_t(
+              XQ, WQ, x_scale, w_scale, global_scale);
+        } else if (K == 5120) {
+          return f4f4bf16_128_192_4_2_1_t(
+              XQ, WQ, x_scale, w_scale, global_scale);
+        } else if (K == 5120 || K == 16384) {
+          return f4f4bf16_256_128_2_4_1_t(
+              XQ, WQ, x_scale, w_scale, global_scale);
+        }
+      } else if (N == 5632) {
+        return f4f4bf16_256_128_2_4_1_t(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 8192) {
+        return f4f4bf16_256_256_4_1_1_t(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    }
     if (N <= 2048) {
       return f4f4bf16_256_128_2_2_1_t(XQ, WQ, x_scale, w_scale, global_scale);
     } else if (N <= 8192) {
@@ -111,6 +167,62 @@ at::Tensor dispatch_f4f4bf16_kernel(
       return f4f4bf16_128_128_4_1_1_f(XQ, WQ, x_scale, w_scale, global_scale);
     }
   } else if (M <= 2048) {
+    if (M <= 256) {
+      if (N == 896) {
+        return f4f4bf16_128_128_2_2_1_f(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 5120) {
+        if (K == 640 || K == 5120) {
+          return f4f4bf16_128_128_4_1_1_f(
+              XQ, WQ, x_scale, w_scale, global_scale);
+        } else if ((K == 8192) || (K == 16384)) {
+          return f4f4bf16_256_128_2_2_1_f(
+              XQ, WQ, x_scale, w_scale, global_scale);
+        }
+      } else if (N == 5632) {
+        return f4f4bf16_128_192_2_2_1_f(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 8192 || N == 16384) {
+        return f4f4bf16_256_128_2_2_1_f(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 512) {
+      if (N == 896) {
+        return f4f4bf16_128_128_2_2_1_f(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 5120) {
+        return f4f4bf16_256_192_4_1_1_f(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 5632) {
+        return f4f4bf16_256_128_2_4_1_f(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 8192) {
+        return f4f4bf16_256_128_2_2_1_f(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 1024) {
+      if (N == 896) {
+        return f4f4bf16_256_128_2_4_1_f(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 5120) {
+        if (K == 640) {
+          return f4f4bf16_128_128_1_4_1_f(
+              XQ, WQ, x_scale, w_scale, global_scale);
+        } else if (K == 5120) {
+          return f4f4bf16_128_192_4_2_1_f(
+              XQ, WQ, x_scale, w_scale, global_scale);
+        } else if (K == 5120 || K == 16384) {
+          return f4f4bf16_256_128_2_4_1_f(
+              XQ, WQ, x_scale, w_scale, global_scale);
+        }
+      } else if (N == 5632) {
+        return f4f4bf16_256_128_2_4_1_f(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N == 8192) {
+        return f4f4bf16_256_256_4_1_1_f(
+            XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    }
     if (N <= 2048) {
       return f4f4bf16_256_128_2_2_1_f(XQ, WQ, x_scale, w_scale, global_scale);
     } else if (N <= 8192) {
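Read together, the two hunks above add the same Llama4-specific (M, N, K) buckets to both the MX (_t) and NV (_f) FP4 paths; shapes that do not hit a bucket still fall through to the pre-existing N-based heuristic. The numeric suffix of each kernel name corresponds to the five integer template parameters of _f4f4bf16, i.e. the tiling/cluster configuration mentioned in the summary. As a reading aid, here is a simplified sketch of a few representative MX-path branches; it is not code from this commit and it assumes M is large enough to land in the dispatcher's M <= 2048 branch:

#include <cstdint>
#include <string>

// Illustrative restatement of a few of the new MX-path branches: returns the
// kernel-name suffix that would be selected, or "general" when the shape
// falls through to the pre-existing N-based selection.
std::string pick_f4f4bf16_tile(int64_t M, int64_t N, int64_t K) {
  if (M <= 256 && N == 5120 && (K == 640 || K == 5120)) {
    return "128_128_4_1_1";
  }
  if (M <= 256 && N == 5120 && (K == 8192 || K == 16384)) {
    return "256_128_2_2_1";
  }
  if (M > 256 && M <= 512 && N == 5120) {
    return "256_192_4_1_1";
  }
  if (M > 512 && M <= 1024 && N == 8192) {
    return "256_256_4_1_1";
  }
  return "general";  // fall through to the pre-existing heuristic
}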
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "f4f4bf16_common.cuh"

namespace fbgemm_gpu {

#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)

at::Tensor f4f4bf16_128_128_1_1_1_f(
    at::Tensor XQ, // FP4
    at::Tensor WQ, // FP4
    at::Tensor x_scale,
    at::Tensor w_scale,
    std::optional<at::Tensor> global_scale = std::nullopt) {
  // Dispatch this kernel to the correct underlying implementation.
  return _f4f4bf16<
      cutlass::nv_float4_t<cutlass::float_e2m1_t>,
      128,
      128,
      1,
      1,
      1>(XQ, WQ, x_scale, w_scale, global_scale);
}

#endif

} // namespace fbgemm_gpu
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "f4f4bf16_common.cuh"

namespace fbgemm_gpu {

#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)

at::Tensor f4f4bf16_128_128_1_1_1_t(
    at::Tensor XQ, // FP4
    at::Tensor WQ, // FP4
    at::Tensor x_scale,
    at::Tensor w_scale,
    std::optional<at::Tensor> global_scale = std::nullopt) {
  // Dispatch this kernel to the correct underlying implementation.
  return _f4f4bf16<
      cutlass::mx_float4_t<cutlass::float_e2m1_t>,
      128,
      128,
      1,
      1,
      1>(XQ, WQ, x_scale, w_scale, global_scale);
}

#endif

} // namespace fbgemm_gpu
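The two files above are a matched pair: f4f4bf16_128_128_1_1_1_f instantiates _f4f4bf16 with cutlass::nv_float4_t and f4f4bf16_128_128_1_1_1_t with cutlass::mx_float4_t, using identical 128/128/1/1/1 template parameters, presumably corresponding to the use_mx flag of dispatch_f4f4bf16_kernel. A minimal sketch of how a caller could route between the pair (the wrapper name is hypothetical; the two kernel entry points are the ones added here):

// Hypothetical wrapper, for illustration only: selects the MX (_t) or NV (_f)
// FP4 instantiation of the 128_128_1_1_1 configuration based on use_mx, the
// same way the dispatcher keeps two parallel heuristic blocks.
at::Tensor run_f4f4bf16_128_128_1_1_1(
    at::Tensor XQ,
    at::Tensor WQ,
    at::Tensor x_scale,
    at::Tensor w_scale,
    std::optional<at::Tensor> global_scale,
    bool use_mx) {
  return use_mx
      ? f4f4bf16_128_128_1_1_1_t(XQ, WQ, x_scale, w_scale, global_scale)
      : f4f4bf16_128_128_1_1_1_f(XQ, WQ, x_scale, w_scale, global_scale);
}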
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "f4f4bf16_common.cuh"

namespace fbgemm_gpu {

#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)

at::Tensor f4f4bf16_128_128_1_2_1_f(
    at::Tensor XQ, // FP4
    at::Tensor WQ, // FP4
    at::Tensor x_scale,
    at::Tensor w_scale,
    std::optional<at::Tensor> global_scale = std::nullopt) {
  // Dispatch this kernel to the correct underlying implementation.
  return _f4f4bf16<
      cutlass::nv_float4_t<cutlass::float_e2m1_t>,
      128,
      128,
      1,
      2,
      1>(XQ, WQ, x_scale, w_scale, global_scale);
}

#endif

} // namespace fbgemm_gpu
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "f4f4bf16_common.cuh"

namespace fbgemm_gpu {

#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)

at::Tensor f4f4bf16_128_128_1_2_1_t(
    at::Tensor XQ, // FP4
    at::Tensor WQ, // FP4
    at::Tensor x_scale,
    at::Tensor w_scale,
    std::optional<at::Tensor> global_scale = std::nullopt) {
  // Dispatch this kernel to the correct underlying implementation.
  return _f4f4bf16<
      cutlass::mx_float4_t<cutlass::float_e2m1_t>,
      128,
      128,
      1,
      2,
      1>(XQ, WQ, x_scale, w_scale, global_scale);
}

#endif

} // namespace fbgemm_gpu
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "f4f4bf16_common.cuh"

namespace fbgemm_gpu {

#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)

at::Tensor f4f4bf16_128_128_1_4_1_f(
    at::Tensor XQ, // FP4
    at::Tensor WQ, // FP4
    at::Tensor x_scale,
    at::Tensor w_scale,
    std::optional<at::Tensor> global_scale = std::nullopt) {
  // Dispatch this kernel to the correct underlying implementation.
  return _f4f4bf16<
      cutlass::nv_float4_t<cutlass::float_e2m1_t>,
      128,
      128,
      1,
      4,
      1>(XQ, WQ, x_scale, w_scale, global_scale);
}

#endif

} // namespace fbgemm_gpu
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "f4f4bf16_common.cuh"

namespace fbgemm_gpu {

#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)

at::Tensor f4f4bf16_128_128_1_4_1_t(
    at::Tensor XQ, // FP4
    at::Tensor WQ, // FP4
    at::Tensor x_scale,
    at::Tensor w_scale,
    std::optional<at::Tensor> global_scale = std::nullopt) {
  // Dispatch this kernel to the correct underlying implementation.
  return _f4f4bf16<
      cutlass::mx_float4_t<cutlass::float_e2m1_t>,
      128,
      128,
      1,
      4,
      1>(XQ, WQ, x_scale, w_scale, global_scale);
}

#endif

} // namespace fbgemm_gpu
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "f4f4bf16_common.cuh"

namespace fbgemm_gpu {

#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)

at::Tensor f4f4bf16_128_128_2_2_1_f(
    at::Tensor XQ, // FP4
    at::Tensor WQ, // FP4
    at::Tensor x_scale,
    at::Tensor w_scale,
    std::optional<at::Tensor> global_scale = std::nullopt) {
  // Dispatch this kernel to the correct underlying implementation.
  return _f4f4bf16<
      cutlass::nv_float4_t<cutlass::float_e2m1_t>,
      128,
      128,
      2,
      2,
      1>(XQ, WQ, x_scale, w_scale, global_scale);
}

#endif

} // namespace fbgemm_gpu
