
Commit 23c3162

add mxfp8_cublas recipe to mx_formats (#1831)
* Update [ghstack-poisoned]
* Update [ghstack-poisoned]
* Update [ghstack-poisoned]
* Update [ghstack-poisoned]
* Update [ghstack-poisoned]
* Update [ghstack-poisoned]
1 parent 8641fd6 commit 23c3162
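To see the new recipe in context, here is a minimal sketch of how it is intended to be selected, using only APIs touched in this commit (`MXLinearConfig.from_recipe_name`, `swap_linear_with_mx_linear`). It assumes an NVIDIA Blackwell GPU, a recent PyTorch nightly with mxfp8 `torch._scaled_mm` support, and that bfloat16 is an accepted high-precision dtype for the swapped modules; treat it as illustrative, not authoritative.

```python
import torch

from torchao.prototype.mx_formats.config import MXLinearConfig, MXLinearRecipeName
from torchao.prototype.mx_formats.mx_linear import swap_linear_with_mx_linear

# build a config from the recipe added in this commit
config = MXLinearConfig.from_recipe_name(MXLinearRecipeName.MXFP8_CUBLAS)

# swap eligible nn.Linear modules for MX linear modules using that config
m = torch.nn.Sequential(torch.nn.Linear(32, 32)).cuda().to(torch.bfloat16)
swap_linear_with_mx_linear(m, config=config)

# the forward pass dispatches the mxfp8 gemm to cuBLAS via torch._scaled_mm
y = m(torch.randn(16, 32, device="cuda", dtype=torch.bfloat16))
```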

File tree

5 files changed: +60 -17 lines changed


test/prototype/mx_formats/test_mx_linear.py

Lines changed: 18 additions & 8 deletions
@@ -11,7 +11,10 @@
 import torch
 import torch.nn as nn
 
-from torchao.prototype.mx_formats.config import MXGemmKernelChoice, MXLinearConfig
+from torchao.prototype.mx_formats.config import (
+    MXLinearConfig,
+    MXLinearRecipeName,
+)
 from torchao.prototype.mx_formats.constants import DTYPE_FP4, SUPPORTED_ELEM_DTYPES
 from torchao.prototype.mx_formats.mx_linear import (
     MXInferenceLinear,
@@ -98,9 +101,16 @@ def test_linear_eager(elem_dtype, bias, input_shape):
 @pytest.mark.skipif(
     not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for mxfloat8"
 )
-@pytest.mark.parametrize("elem_dtype", [torch.float8_e4m3fn, DTYPE_FP4])
+@pytest.mark.parametrize(
+    "recipe_name",
+    [
+        MXLinearRecipeName.MXFP8_CUBLAS,
+        MXLinearRecipeName.MXFP8_CUTLASS,
+        MXLinearRecipeName.MXFP4_CUTLASS,
+    ],
+)
 @pytest.mark.parametrize("mkn", [(128, 256, 512), (256, 512, 128), (512, 128, 256)])
-def test_linear_eager_emulated_vs_real_gemm(elem_dtype, mkn):
+def test_linear_eager_emulated_vs_real_gemm(recipe_name, mkn):
     M, K, N = 128, 128, 128
     M, K, N = mkn
 
@@ -112,12 +122,12 @@ def test_linear_eager_emulated_vs_real_gemm(elem_dtype, mkn):
     )
     m_real = copy.deepcopy(m_emulated)
 
+    elem_dtype = torch.float8_e4m3fn
+    if recipe_name == MXLinearRecipeName.MXFP4_CUTLASS:
+        elem_dtype = DTYPE_FP4
+
     config_emulated = MXLinearConfig(block_size=32, elem_dtype=elem_dtype)
-    config_real = MXLinearConfig(
-        block_size=32,
-        elem_dtype=elem_dtype,
-        gemm_kernel_choice=MXGemmKernelChoice.CUTLASS,
-    )
+    config_real = MXLinearConfig.from_recipe_name(recipe_name)
 
     swap_linear_with_mx_linear(m_emulated, config=config_emulated)
     swap_linear_with_mx_linear(m_real, config=config_real)
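In words: for each recipe, the updated test builds an emulated copy and a real-gemm copy of the same model and checks that their outputs roughly agree. A rough standalone sketch of that flow is below; the model shape, bias setting, and the `allclose` tolerance are placeholder assumptions, since the comparison used by the actual test is not shown in this diff.

```python
import copy

import torch

from torchao.prototype.mx_formats.config import MXLinearConfig, MXLinearRecipeName
from torchao.prototype.mx_formats.constants import DTYPE_FP4
from torchao.prototype.mx_formats.mx_linear import swap_linear_with_mx_linear


def compare_emulated_vs_real(recipe_name, mkn=(128, 256, 512)):
    M, K, N = mkn
    m_emulated = torch.nn.Sequential(torch.nn.Linear(K, N, bias=False)).cuda().to(torch.bfloat16)
    m_real = copy.deepcopy(m_emulated)

    # mirror the elem_dtype selection added in this diff
    elem_dtype = torch.float8_e4m3fn
    if recipe_name == MXLinearRecipeName.MXFP4_CUTLASS:
        elem_dtype = DTYPE_FP4

    config_emulated = MXLinearConfig(block_size=32, elem_dtype=elem_dtype)
    config_real = MXLinearConfig.from_recipe_name(recipe_name)

    swap_linear_with_mx_linear(m_emulated, config=config_emulated)
    swap_linear_with_mx_linear(m_real, config=config_real)

    x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
    y_emulated = m_emulated(x)
    y_real = m_real(x)

    # placeholder comparison; the actual test's check is not part of this diff
    assert torch.allclose(y_emulated, y_real, atol=1e-1, rtol=1e-1)
```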

torchao/prototype/mx_formats/README.md

Lines changed: 5 additions & 3 deletions
@@ -42,13 +42,15 @@ This is a module to do MX training, the MX matmul is currently emulated.
 ```python
 from torchao.prototype.mx_formats.mx_linear import swap_linear_with_mx_linear
 from torchao.prototype.mx_formats.config import MXLinearConfig, MXGemmKernelChoice
-from torchao.utils import is_sm_at_least_100
 
 # early prototype: on MX-enabled hardware, you can use the real MX gemm backed by
 # torchao's CUTLASS kernels. In the future, we will also add cuBLAS kernel support.
 gemm_kernel_choice = MXGemmKernelChoice.EMULATED
-if is_sm_at_least_100():
-    gemm_kernel_choice = MXGemmKernelChoice.CUTLASS
+
+# on NVIDIA Blackwell GPUs, you can also use cuBLAS or CUTLASS mxfp8 kernels
+# note: torch.compile support for both of these is WIP
+# gemm_kernel_choice = MXGemmKernelChoice.CUTLASS
+# gemm_kernel_choice = MXGemmKernelChoice.CUBLAS
 
 m = torch.nn.Sequential(torch.nn.Linear(32, 32)).cuda()
 config = MXLinearConfig(
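The README snippet above is cut off at the `config = MXLinearConfig(` line simply because that is where the hunk's context ends. A plausible continuation is sketched below for readability; the field values are assumptions consistent with the config fields that appear elsewhere in this commit, not the literal remainder of the README.

```python
# hypothetical continuation of the truncated README snippet above
config = MXLinearConfig(
    elem_dtype=torch.float8_e4m3fn,
    block_size=32,
    gemm_kernel_choice=gemm_kernel_choice,
)
swap_linear_with_mx_linear(m, config=config)
```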

torchao/prototype/mx_formats/config.py

Lines changed: 22 additions & 2 deletions
@@ -24,12 +24,16 @@ class MXGemmKernelChoice(Enum):
     # available only when CUDA capability is greater than or equal to 10.0
     CUTLASS = "cutlass"
 
-    # TODO(future PR): add cuBLAS here once we land pytorch/pytorch support
+    # available only when CUDA capability is greater than or equal to 10.0
+    # available on recent versions of PyTorch nightly, with https://github.com/pytorch/pytorch/pull/147548
+    # note: torch.compile does not work yet, see https://github.com/pytorch/pytorch/issues/147873
+    CUBLAS = "cublas"
 
 
 # Pre-made recipes for common configurations
 class MXLinearRecipeName(Enum):
     MXFP8_EMULATED = "mxfp8_emulated"
+    MXFP8_CUBLAS = "mxfp8_cublas"
     MXFP8_CUTLASS = "mxfp8_cutlass"
     MXFP4_EMULATED = "mxfp4_emulated"
     MXFP4_CUTLASS = "mxfp4_cutlass"
@@ -86,6 +90,20 @@ def __post_init__(self):
             assert (
                 self.elem_dtype_grad_output_override is None
             ), "elem_dtype_grad_output_override not supported for CUTLASS MX gemm kernels"
+        elif self.gemm_kernel_choice == MXGemmKernelChoice.CUBLAS:
+            assert (
+                self.block_size == 32
+            ), f"block_size must be 32 to use the cuBLAS MX gemm kernels, got {self.block_size}"
+            valid_dtypes = [torch.float8_e4m3fn]
+            assert (
+                self.elem_dtype in valid_dtypes
+            ), f"elem_dtype must be one of {valid_dtypes} to use the CUTLASS MX gemm kernels, got {self.elem_dtype}"
+            assert (
+                self.elem_dtype_weight_override is None
+            ), "elem_dtype_weight_override not supported for CUTLASS MX gemm kernels"
+            assert (
+                self.elem_dtype_grad_output_override is None
+            ), "elem_dtype_grad_output_override not supported for CUTLASS MX gemm kernels"
 
     @staticmethod
     def from_recipe_name(
@@ -104,11 +122,13 @@ def from_recipe_name(
 
         if recipe_name is MXLinearRecipeName.MXFP8_EMULATED:
             return MXLinearConfig()
+        elif recipe_name is MXLinearRecipeName.MXFP8_CUBLAS:
+            return MXLinearConfig(gemm_kernel_choice=MXGemmKernelChoice.CUBLAS)
         elif recipe_name is MXLinearRecipeName.MXFP8_CUTLASS:
             return MXLinearConfig(gemm_kernel_choice=MXGemmKernelChoice.CUTLASS)
         elif recipe_name is MXLinearRecipeName.MXFP4_EMULATED:
             return MXLinearConfig(elem_dtype=DTYPE_FP4)
-        elif recipe_name is MXLinearRecipeName.MXFP8_CUTLASS:
+        elif recipe_name is MXLinearRecipeName.MXFP4_CUTLASS:
             return MXLinearConfig(
                 elem_dtype=DTYPE_FP4, gemm_kernel_choice=MXGemmKernelChoice.CUTLASS
             )
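A short sketch of what the new plumbing provides, using only behavior visible in this diff: `from_recipe_name` maps the new enum member to a cuBLAS-backed config, and the new `__post_init__` branch rejects element dtypes the cuBLAS path does not accept. The `block_size`/`elem_dtype` asserts below rely on the config defaults satisfying the cuBLAS checks, which must hold for the recipe constructor to succeed.

```python
import torch

from torchao.prototype.mx_formats.config import (
    MXGemmKernelChoice,
    MXLinearConfig,
    MXLinearRecipeName,
)
from torchao.prototype.mx_formats.constants import DTYPE_FP4

# the new recipe resolves to a config with the cuBLAS kernel choice
config = MXLinearConfig.from_recipe_name(MXLinearRecipeName.MXFP8_CUBLAS)
assert config.gemm_kernel_choice == MXGemmKernelChoice.CUBLAS
# implied by the __post_init__ checks: cuBLAS requires block_size 32 and fp8 e4m3 elements
assert config.block_size == 32
assert config.elem_dtype == torch.float8_e4m3fn

# invalid combination: fp4 elements are not accepted by the cuBLAS path,
# so __post_init__ raises an AssertionError
try:
    MXLinearConfig(elem_dtype=DTYPE_FP4, gemm_kernel_choice=MXGemmKernelChoice.CUBLAS)
except AssertionError as e:
    print(e)
```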

torchao/prototype/mx_formats/mx_ops.py

Lines changed: 14 additions & 4 deletions
@@ -70,7 +70,7 @@ def mx_mm(aten_op, args, kwargs=None):
     b = args[1]
     assert isinstance(a, MXTensor) and isinstance(b, MXTensor)
     assert a._gemm_kernel_choice == b._gemm_kernel_choice, "unsupported"
-    if a._gemm_kernel_choice == MXGemmKernelChoice.CUTLASS:
+    if a._gemm_kernel_choice in (MXGemmKernelChoice.CUBLAS, MXGemmKernelChoice.CUTLASS):
         # real MX gemm backed by torchao's CUTLASS kernels
         M, K, N = a.shape[0], a.shape[1], b.shape[1]
         assert b._data.t().is_contiguous()
@@ -81,12 +81,22 @@
         b_scale_block = to_blocked(b_scale)
         if a._elem_dtype == torch.float8_e4m3fn:
             assert b._elem_dtype == torch.float8_e4m3fn
-            res = torchao.ops.mx_fp8_bf16(
-                a._data, b._data, a_scale_block, b_scale_block
-            )
+            if a._gemm_kernel_choice is MXGemmKernelChoice.CUBLAS:
+                res = torch._scaled_mm(
+                    a._data,
+                    b._data,
+                    a_scale_block.view(torch.float8_e8m0fnu),
+                    b_scale_block.view(torch.float8_e8m0fnu),
+                    out_dtype=torch.bfloat16,
+                )
+            else:
+                res = torchao.ops.mx_fp8_bf16(
+                    a._data, b._data, a_scale_block, b_scale_block
+                )
         else:
             assert a._elem_dtype == DTYPE_FP4
             assert b._elem_dtype == DTYPE_FP4
+            assert a._gemm_kernel_choice is MXGemmKernelChoice.CUTLASS, "unsupported"
             res = torchao.ops.mx_fp4_bf16(
                 a._data, b._data, a_scale_block, b_scale_block
             )
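One detail worth calling out in the new cuBLAS branch is the `.view(torch.float8_e8m0fnu)` on the block scales: the scales are stored as raw bytes, and viewing them as e8m0 reinterprets each byte as a power-of-two scale with bias 127. A tiny sketch of just that reinterpretation follows; it assumes a PyTorch build that exposes `torch.float8_e8m0fnu`, and the blocked scale layout produced by `to_blocked` is a separate concern not shown here.

```python
import torch

# e8m0 is an exponent-only 8-bit format: each byte is a biased exponent (bias 127),
# so a stored byte b represents the scale 2 ** (b - 127)
raw = torch.tensor([126, 127, 128, 130], dtype=torch.uint8)
scales = raw.view(torch.float8_e8m0fnu)
print(scales.float())  # tensor([0.5000, 1.0000, 2.0000, 8.0000])
```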

torchao/utils.py

Lines changed: 1 addition & 0 deletions
@@ -614,6 +614,7 @@ def _torch_version_at_least(min_version):
 # | MI300X | gfx940, gfx941, gfx942 |
 # | MI350 | gfx950 |
 
+
 def is_ROCM():
     return torch.cuda.is_available() and torch.version.hip
 
