
Commit 4823b37

Add CUDA kernel for MXFP8 dim1 casting
Co-authored-by: Less Wright <lessw@etrillium.com>
stack-info: PR: #2513, branch: danielvegamyhre/stack/3
1 parent c1e84cc commit 4823b37

File tree

9 files changed: +2244 −13 lines changed

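Based on the benchmark and test changes in this commit, the new kernel is exposed to Python through mxfp8_cuda.quantize. A minimal usage sketch, assuming the mxfp8_cuda extension has been built and an SM 10.0+ GPU is present; the meaning of the first and third return values is inferred from the rowwise/colwise flags and is not confirmed by this diff:

import torch
import mxfp8_cuda  # extension added by this commit; must be compiled separately

x = torch.randn(2048, 2048, dtype=torch.bfloat16, device="cuda")

# Column-wise (dim1) MXFP8 cast with floor scaling and 1x32 scale blocks, as
# used in the benchmark and tests below. The call returns four tensors,
# presumably (rowwise_data, colwise_data, rowwise_scales, colwise_scales);
# only the colwise outputs are needed here since rowwise=False.
_, y_d1, _, s_d1 = mxfp8_cuda.quantize(
    x,
    rowwise=False,
    colwise=True,
    scaling_mode="floor",  # or "rceil"
    scale_dim_x=1,
    scale_dim_y=32,
)

assert y_d1.dtype == torch.float8_e4m3fn   # quantized data
assert s_d1.dtype == torch.float8_e8m0fnu  # shared per-block scales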

benchmarks/mx_formats/cast_bench.py

Lines changed: 58 additions & 8 deletions
@@ -4,18 +4,26 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Callable, Tuple
+from functools import partial
+from typing import Tuple
 
 import fire
 import torch
 import triton
-from torch._inductor.utils import do_bench_using_profiling
+from triton.testing import do_bench
 
 from torchao.prototype.mx_formats.kernels import (
     triton_to_mxfp8_dim1,
 )
 from torchao.prototype.mx_formats.mx_tensor import to_mx
 
+try:
+    import mxfp8_cuda
+except ImportError:
+    print(
+        "Warning: mxfp8_cuda extension not found or ready. Benchmarks using this will not be able to run."
+    )
+
 torch.manual_seed(0)
 
 bytes_per_el_bf16 = 2
@@ -64,11 +72,8 @@ def to_mx_dim1_reference(x_hp, block_size):
     return data_d1.t(), scale_d1
 
 
-def benchmark_cuda_function_in_microseconds(func: Callable, *args, **kwargs) -> float:
-    """Thin wrapper around do_bench_using_profiling"""
-    no_args = lambda: func(*args, **kwargs)
-    time = do_bench_using_profiling(no_args)
-    return time * 1e3
+def benchmark_cuda_function_in_microseconds(f, *args):
+    return do_bench(lambda: f(*args), return_mode="median") * 1e3
 
 
 def run(
@@ -82,7 +87,16 @@ def run(
     print(f"torch version: {torch.__version__}")
     print(f"triton version: {triton.__version__}")
     print(f"mode: {mode}")
-    assert mode in ("dim0", "dim1", "dim0_dim1", "dim0_mx", "dim1_mx", "dim1_mx_triton")
+    assert mode in (
+        "dim0",
+        "dim1",
+        "dim0_dim1",
+        "dim0_mx",
+        "dim1_mx",
+        "dim1_mx_triton",
+        "dim1_mx_cuda_floor",
+        "dim1_mx_cuda_rceil",
+    )
 
     x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda") * 1000
 
@@ -194,6 +208,42 @@ def run(
         bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
         bps = (bytes_r + bytes_w) / (time_us / 1e6)
 
+    elif mode == "dim1_mx_cuda_floor":
+        bench_fn = partial(
+            mxfp8_cuda.quantize, rowwise=False, colwise=True, scaling_mode="floor"
+        )
+        _, y_d1, _, s_d1 = bench_fn(x)
+
+        for _ in range(2):
+            __ = bench_fn(x)
+
+        time_us = benchmark_cuda_function_in_microseconds(bench_fn, x)
+
+        assert y_d1.dtype == torch.float8_e4m3fn
+        assert s_d1.dtype == torch.float8_e8m0fnu
+
+        bytes_r = x.numel() * bytes_per_el_bf16
+        bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
+        bps = (bytes_r + bytes_w) / (time_us / 1e6)
+
+    elif mode == "dim1_mx_cuda_rceil":
+        bench_fn = partial(
+            mxfp8_cuda.quantize, rowwise=False, colwise=True, scaling_mode="rceil"
+        )
+        _, y_d1, _, s_d1 = bench_fn(x)
+
+        for _ in range(2):
+            __ = bench_fn(x)
+
+        time_us = benchmark_cuda_function_in_microseconds(bench_fn, x)
+
+        assert y_d1.dtype == torch.float8_e4m3fn
+        assert s_d1.dtype == torch.float8_e8m0fnu
+
+        bytes_r = x.numel() * bytes_per_el_bf16
+        bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
+        bps = (bytes_r + bytes_w) / (time_us / 1e6)
+
     else:
         raise AssertionError(f"unknown mode {mode}")
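The two new CUDA benchmark modes report achieved memory bandwidth the same way as the existing dim1_mx_triton path: bytes read are the bf16 input, bytes written are the fp8 output plus the one-byte e8m0 scales, divided by the measured kernel time. A worked example of that arithmetic (the shape and the timing value are illustrative, not measured results):

M, K = 16384, 16384
block_size = 32
bytes_per_el_bf16 = 2
bytes_per_el_fp8 = 1

bytes_r = M * K * bytes_per_el_bf16                           # bf16 input read
bytes_w = (M * K + (M * K) // block_size) * bytes_per_el_fp8  # fp8 data + e8m0 scales written
time_us = 150.0                                               # hypothetical median kernel time

bps = (bytes_r + bytes_w) / (time_us / 1e6)
print(f"{bps / 1e9:.0f} GB/s")  # about 5425 GB/s for these made-up numbers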

test/prototype/mx_formats/test_kernels.py

Lines changed: 105 additions & 1 deletion
@@ -42,14 +42,20 @@
     triton_to_mxfp8_dim1_reference,
     unpack_uint4,
 )
-from torchao.prototype.mx_formats.mx_tensor import MXTensor
+from torchao.prototype.mx_formats.mx_tensor import MXTensor, ScaleCalculationMode
 from torchao.prototype.mx_formats.utils import to_blocked
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_8,
     is_sm_at_least_89,
     is_sm_at_least_100,
 )
 
+try:
+    import mxfp8_cuda
+except ImportError:
+    print("Warning: MXFP8 CUDA extension not available, some tests will be skipped")
+    pass
+
 torch.manual_seed(0)
 
 if not TORCH_VERSION_AT_LEAST_2_8:
@@ -488,3 +494,101 @@ def test_rearrange(shape):
     eager = to_blocked(scales, False)
     triton = to_blocked(scales, True)
     torch.testing.assert_close(eager, triton, atol=0, rtol=0)
+
+
+@pytest.mark.skipif(
+    not is_sm_at_least_100(),
+    reason="MXFP8 requires CUDA capability 10.0 or greater",
+)
+@pytest.mark.skipif(
+    "mxfp8_cuda" not in globals(),
+    reason="mxfp8_cuda extension not available",
+)
+@pytest.mark.parametrize("M", (32, 64, 2048))
+@pytest.mark.parametrize("K", (32, 64, 2048))
+@pytest.mark.parametrize("input_dtype", (torch.float32, torch.bfloat16))
+@pytest.mark.parametrize(
+    "scaling_mode", (ScaleCalculationMode.FLOOR, ScaleCalculationMode.RCEIL)
+)
+def test_cuda_mx_dim1_numerics(M, K, input_dtype, scaling_mode):
+    scaling_mode_str = (
+        "floor" if scaling_mode == ScaleCalculationMode.FLOOR else "rceil"
+    )
+    block_size = 32
+
+    # Use distinct incrementing values from 0 to M*K-1 to make debugging easier.
+    x = (
+        torch.arange(0, M * K, dtype=input_dtype, device="cuda")
+        .reshape(M, K)
+        .contiguous()
+    )
+
+    y_d1_ref, s_d1_ref = triton_to_mxfp8_dim1_reference(
+        x, block_size=block_size, scaling_mode=scaling_mode
+    )
+    _, y_d1, _, s_d1 = mxfp8_cuda.quantize(
+        x,
+        rowwise=False,
+        colwise=True,
+        scaling_mode=scaling_mode_str,
+        scale_dim_x=1,
+        scale_dim_y=block_size,
+    )
+
+    # check scales
+    torch.testing.assert_close(s_d1, s_d1_ref, rtol=0, atol=0)
+
+    # check quantized values
+    torch.testing.assert_close(y_d1, y_d1_ref, rtol=0, atol=0)
+
+
+@pytest.mark.skipif(
+    not is_sm_at_least_100(),
+    reason="MXFP8 requires CUDA capability 10.0 or greater",
+)
+@pytest.mark.skipif(
+    "mxfp8_cuda" not in globals(),
+    reason="mxfp8_cuda extension not available",
+)
+def test_cuda_mx_dim0_not_supported():
+    M, K = 64, 64
+    block_size = 32
+    x = (
+        torch.arange(0, M * K, dtype=torch.bfloat16, device="cuda")
+        .reshape(M, K)
+        .contiguous()
+    )
+    with pytest.raises(RuntimeError):
+        _, y_d1, _, s_d1 = mxfp8_cuda.quantize(
+            x,
+            rowwise=True,
+            colwise=False,
+            scale_dim_x=block_size,
+            scale_dim_y=1,
+        )
+
+
+@pytest.mark.skipif(
+    not is_sm_at_least_100(),
+    reason="MXFP8 requires CUDA capability 10.0 or greater",
+)
+@pytest.mark.skipif(
+    "mxfp8_cuda" not in globals(),
+    reason="mxfp8_cuda extension not available",
+)
+def test_cuda_mx_dim1_invalid_block_size():
+    M, K = 64, 64
+    x = (
+        torch.arange(0, M * K, dtype=torch.bfloat16, device="cuda")
+        .reshape(M, K)
+        .contiguous()
+    )
+    invalid_block_size = 4
+    with pytest.raises(RuntimeError):
+        _, y_d1, _, s_d1 = mxfp8_cuda.quantize(
            x,
            rowwise=False,
            colwise=True,
            scale_dim_x=1,
            scale_dim_y=invalid_block_size,
        )
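The parametrized numerics test checks both scale calculation modes against the Triton reference. As background, a rough sketch of how the two modes are commonly defined for MXFP8 with an e4m3 target (illustrative only; torchao's to_mx, reached through triton_to_mxfp8_dim1_reference, is the ground truth the test compares against):

import math

F8E4M3_MAX = 448.0   # largest finite float8_e4m3fn value
E4M3_MAX_POW2 = 8    # floor(log2(448))

def floor_scale_exponent(block_amax: float) -> int:
    # "floor": truncate log2 of the block amax and offset by the target's max exponent
    return math.floor(math.log2(block_amax)) - E4M3_MAX_POW2

def rceil_scale_exponent(block_amax: float) -> int:
    # "rceil": round block_amax / F8E4M3_MAX up to the next power of two
    return math.ceil(math.log2(block_amax / F8E4M3_MAX))

# For a block whose max magnitude is 1000, the two modes can disagree by one step:
print(floor_scale_exponent(1000.0), rceil_scale_exponent(1000.0))  # 1 2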

torchao/prototype/mx_formats/kernels.py

Lines changed: 10 additions & 4 deletions
@@ -1375,17 +1375,23 @@ def custom_triton_to_mxfp8_dim1_sharding(x, inner_block_size=32):
     return acceptable_shardings
 
 def triton_to_mxfp8_dim1_reference(
-    x_hp: torch.Tensor, block_size
+    x_hp: torch.Tensor,
+    block_size,
+    scaling_mode=None,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     A reference version of `to_mxfp8_dim1`.
     """
-    from torchao.prototype.mx_formats.mx_tensor import to_mx
+    from torchao.prototype.mx_formats.mx_tensor import ScaleCalculationMode, to_mx
+
+    scale_mode = (
+        ScaleCalculationMode.FLOOR if scaling_mode is None else scaling_mode
+    )
 
     # cast across dim1
     x_hp_d1 = x_hp.t().contiguous()
     scale_e8m0_dim1, x_hp_d1_normalized = to_mx(
-        x_hp_d1, torch.float8_e4m3fn, block_size
+        x_hp_d1, torch.float8_e4m3fn, block_size, scaling_mode=scale_mode
     )
     scale_e8m0_dim1 = scale_e8m0_dim1.view(torch.float8_e8m0fnu)
     return (
@@ -1718,7 +1724,7 @@ def triton_to_mxfp8_dim1(
     raise AssertionError("needs torch version 2.8+ and triton")
 
 def triton_to_mxfp8_dim1_reference(
-    x_hp: torch.Tensor, block_size
+    x_hp: torch.Tensor, block_size, scaling_mode
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     raise AssertionError("needs torch version 2.8+ and triton")
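Since scaling_mode defaults to None (which maps to ScaleCalculationMode.FLOOR), existing callers of the reference cast keep their behavior, while the new tests can request RCEIL explicitly. A minimal call sketch mirroring the test, assuming torch 2.8+ with Triton available:

import torch

from torchao.prototype.mx_formats.kernels import triton_to_mxfp8_dim1_reference
from torchao.prototype.mx_formats.mx_tensor import ScaleCalculationMode

x = torch.randn(64, 64, dtype=torch.bfloat16, device="cuda")

# Default: omitting scaling_mode falls back to FLOOR, matching prior behavior.
y_floor, s_floor = triton_to_mxfp8_dim1_reference(x, block_size=32)

# Explicit RCEIL scaling, as exercised by test_cuda_mx_dim1_numerics.
y_rceil, s_rceil = triton_to_mxfp8_dim1_reference(
    x, block_size=32, scaling_mode=ScaleCalculationMode.RCEIL
)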
