ROCm mx-fp8 Gemm #2066
```diff
@@ -32,11 +32,15 @@ class MXGemmKernelChoice(Enum):
     # note: torch.compile does not work yet, see https://github.com/pytorch/pytorch/issues/147873
     CUBLAS = "cublas"
 
+    # available only on ROCm with HIPBLASLT support, requires gfx950 and ROCm 7.0
+    HIPBLASLT = "hipblaslt"
```
Review discussion on the `HIPBLASLT` addition:

> We should change this to
> cc @vkuzo

> Could you clarify your approach here? I believe

> Ohh, I just mean that if I want to quantize a model to mxfp8 I need to know whether I am running on ROCm or CUDA, and the only place where one needs to know this is here. But in reality the "CUBLAS" enum really means "call into scaled_mm", and that would handle all the dispatch logic. It feels weird and an anti-pattern to core PyTorch to have device-specific (CUDA/ROCm) APIs when we don't need to.

> yes, IMO for what this PR is trying to do we should rename
```diff
 
 # Pre-made recipes for common configurations
 class MXLinearRecipeName(Enum):
     MXFP8_EMULATED = "mxfp8_emulated"
     MXFP8_CUBLAS = "mxfp8_cublas"
+    MXFP8_HIPBLASLT = "mxfp8_hipblaslt"
     MXFP4_EMULATED = "mxfp4_emulated"
     MXFP4_CUTLASS = "mxfp4_cutlass"
```
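A minimal usage sketch of the new kernel choice, assuming the enums and config live at `torchao.prototype.mx_formats.config` (the import path and the explicit device branch are illustrative, not part of this diff). The branch on `torch.version.hip` is exactly the caller-side dispatch the reviewers above would prefer to hide behind a single scaled_mm-backed enum value:

```python
import torch

# Assumed import path for the module this diff edits.
from torchao.prototype.mx_formats.config import MXGemmKernelChoice, MXLinearConfig

# With this PR, a caller who wants a hardware-backed mxfp8 gemm picks the
# backend per build: hipBLASLt on ROCm, cuBLAS on CUDA.
kernel_choice = (
    MXGemmKernelChoice.HIPBLASLT
    if torch.version.hip is not None  # non-None only on ROCm builds
    else MXGemmKernelChoice.CUBLAS
)
config = MXLinearConfig(gemm_kernel_choice=kernel_choice)
print(config.gemm_kernel_choice.value)  # "hipblaslt" on ROCm, "cublas" on CUDA
```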
```diff
@@ -64,6 +68,15 @@ def _validate_gemm_kernel_choice(gemm_kernel_choice, block_size, elem_dtype):
         assert elem_dtype in valid_dtypes, (
             f"elem_dtype must be one of {valid_dtypes} to use the CUTLASS MX gemm kernels, got {elem_dtype}"
         )
+    elif gemm_kernel_choice == MXGemmKernelChoice.HIPBLASLT:
+        assert block_size == 32, (
+            f"block_size must be 32 to use the HIPBLASLT MX gemm kernels, got {block_size}"
+        )
+        valid_dtypes = [torch.float8_e4m3fn]
+        assert elem_dtype in valid_dtypes, (
+            f"elem_dtype must be one of {valid_dtypes} to use the HIPBLASLT MX gemm kernels, got {elem_dtype}"
+        )
+        assert torch.version.hip is not None, "HIPBLASLT requires ROCm"
 
 
 @dataclass
```
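For illustration, the conditions the new HIPBLASLT branch enforces, restated as a standalone predicate; the helper name is hypothetical, and in the PR itself these checks are asserts raised from `_validate_gemm_kernel_choice`:

```python
import torch

def hipblaslt_mx_gemm_supported(block_size: int, elem_dtype: torch.dtype) -> bool:
    # Hypothetical helper mirroring the asserts added above: the hipBLASLt
    # mx gemm path needs a ROCm build, block_size == 32, and fp8 e4m3 elements.
    return (
        torch.version.hip is not None
        and block_size == 32
        and elem_dtype == torch.float8_e4m3fn
    )

# Prints False on a CUDA-only build, since torch.version.hip is None there.
print(hipblaslt_mx_gemm_supported(32, torch.float8_e4m3fn))
```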
```diff
@@ -124,6 +137,8 @@ def from_recipe_name(
             return MXLinearConfig()
         elif recipe_name is MXLinearRecipeName.MXFP8_CUBLAS:
             return MXLinearConfig(gemm_kernel_choice=MXGemmKernelChoice.CUBLAS)
+        elif recipe_name is MXLinearRecipeName.MXFP8_HIPBLASLT:
+            return MXLinearConfig(gemm_kernel_choice=MXGemmKernelChoice.HIPBLASLT)
         elif recipe_name is MXLinearRecipeName.MXFP4_EMULATED:
             return MXLinearConfig(elem_dtype=torch.float4_e2m1fn_x2)
         elif recipe_name is MXLinearRecipeName.MXFP4_CUTLASS:
```
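And a short sketch of the new recipe path, again assuming the `torchao.prototype.mx_formats.config` location and that `from_recipe_name` is exposed on `MXLinearConfig` as the diff context suggests:

```python
from torchao.prototype.mx_formats.config import (
    MXGemmKernelChoice,
    MXLinearConfig,
    MXLinearRecipeName,
)

# The new recipe name resolves to a config that selects the hipBLASLt kernel.
config = MXLinearConfig.from_recipe_name(MXLinearRecipeName.MXFP8_HIPBLASLT)
assert config.gemm_kernel_choice is MXGemmKernelChoice.HIPBLASLT
```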