Commit 0065bcd

Add CUDA kernel for MXFP8 dim1 casting
Co-authored-by: Less Wright <lessw@etrillium.com>
stack-info: PR: #2513, branch: danielvegamyhre/stack/3
1 parent ddd4021 commit 0065bcd

9 files changed: +2274 / −19 lines
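
The new mxfp8_cuda extension exposes a quantize entry point, which the benchmark and tests below call for the dim1 (column-wise) cast. A minimal usage sketch, pieced together from the call sites in this diff; reading the first and third return slots as the rowwise outputs is an inference from the `_, y_d1, _, s_d1` unpacking, not something stated on this page:

import torch

try:
    import mxfp8_cuda  # CUDA extension added by this commit; must be built first
except ImportError:
    mxfp8_cuda = None

if mxfp8_cuda is not None and torch.cuda.is_available():
    x = torch.randn(4096, 4096, dtype=torch.bfloat16, device="cuda")
    # Cast across dim1 (column-wise) with floor-based scale calculation.
    _, y_d1, _, s_d1 = mxfp8_cuda.quantize(
        x, rowwise=False, colwise=True, scaling_mode="floor"
    )
    assert y_d1.dtype == torch.float8_e4m3fn   # quantized data
    assert s_d1.dtype == torch.float8_e8m0fnu  # e8m0 per-block scales

The "rceil" scaling mode is exercised the same way; only the scaling_mode string changes.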

benchmarks/mx_formats/cast_bench.py

Lines changed: 75 additions & 14 deletions

@@ -4,18 +4,25 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Callable, Tuple
+from typing import Tuple
 
 import fire
 import torch
 import triton
-from torch._inductor.utils import do_bench_using_profiling
+from triton.testing import do_bench
 
 from torchao.prototype.mx_formats.kernels import (
     triton_to_mxfp8_dim1,
 )
 from torchao.prototype.mx_formats.mx_tensor import to_mx
 
+try:
+    import mxfp8_cuda
+except ImportError:
+    print(
+        "Warning: mxfp8_cuda extension not found or ready. Benchmarks using this will not be able to run."
+    )
+
 torch.manual_seed(0)
 
 bytes_per_el_bf16 = 2
@@ -64,29 +71,35 @@ def to_mx_dim1_reference(x_hp, block_size):
     return data_d1.t(), scale_d1
 
 
-def benchmark_cuda_function_in_microseconds(func: Callable, *args, **kwargs) -> float:
-    """Thin wrapper around do_bench_using_profiling"""
-    no_args = lambda: func(*args, **kwargs)
-    time = do_bench_using_profiling(no_args)
-    return time * 1e3
+def benchmark_cuda_function_in_microseconds(f, *args):
+    return do_bench(lambda: f(*args), return_mode="median") * 1e3
 
 
 def run(
     M: int = 16384,
     K: int = 16384,
     BLOCK_SIZE: int = 32,
-    mode: str = "dim0",
+    mode: str = "dim0_floor",
 ):
     print(f"M {M} K {K} BLOCK_SIZE {BLOCK_SIZE}")
     print(f"GPU: {torch.cuda.get_device_name(0)}")
     print(f"torch version: {torch.__version__}")
     print(f"triton version: {triton.__version__}")
     print(f"mode: {mode}")
-    assert mode in ("dim0", "dim1", "dim0_dim1", "dim0_mx", "dim1_mx", "dim1_mx_triton")
+    assert mode in (
+        "dim0_floor",
+        "dim1_floor",
+        "dim0_dim1_floor",
+        "dim0_mx_floor",
+        "dim1_mx_floor",
+        "dim1_mx_triton",
+        "dim1_mx_cuda_floor",
+        "dim1_mx_cuda_rceil",
+    )
 
     x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda") * 1000
 
-    if mode == "dim0":
+    if mode == "dim0_floor":
         scale_dim0_reference_c = torch.compile(scale_dim0_reference)
         y_d0, s_d0 = scale_dim0_reference_c(x, BLOCK_SIZE)
 
@@ -103,7 +116,7 @@ def run(
         bytes_rw = sum(t.numel() for t in [x, y_d0, s_d0]) * bytes_per_el_bf16
         bps = bytes_rw / (time_us / 1e6)
 
-    elif mode == "dim1":
+    elif mode == "dim1_floor":
         scale_dim1_reference_c = torch.compile(scale_dim1_reference)
         y_d1, s_d1 = scale_dim1_reference_c(x, BLOCK_SIZE)
 
@@ -120,7 +133,7 @@ def run(
         bytes_rw = sum(t.numel() for t in [x, y_d1, s_d1]) * bytes_per_el_bf16
         bps = bytes_rw / (time_us / 1e6)
 
-    elif mode == "dim0_dim1":
+    elif mode == "dim0_dim1_floor":
         scale_dim0_dim1_reference_c = torch.compile(scale_dim0_dim1_reference)
         y_d0, y_d1, s_d0, s_d1 = scale_dim0_dim1_reference_c(x, BLOCK_SIZE)
 
@@ -141,7 +154,7 @@ def run(
         )
         bps = bytes_rw / (time_us / 1e6)
 
-    elif mode == "dim0_mx":
+    elif mode == "dim0_mx_floor":
         to_mx_dim0_reference_c = torch.compile(to_mx_dim0_reference)
         y_d0, s_d0 = to_mx_dim0_reference_c(x, BLOCK_SIZE)
 
@@ -159,7 +172,7 @@ def run(
         bytes_w = (y_d0.numel() + s_d0.numel()) * bytes_per_el_fp8
         bps = (bytes_r + bytes_w) / (time_us / 1e6)
 
-    elif mode == "dim1_mx":
+    elif mode == "dim1_mx_floor":
        to_mx_dim1_reference_c = torch.compile(to_mx_dim1_reference)
        y_d1, s_d1 = to_mx_dim1_reference_c(x, BLOCK_SIZE)
 
@@ -194,6 +207,54 @@ def run(
         bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
         bps = (bytes_r + bytes_w) / (time_us / 1e6)
 
+    elif mode == "dim1_mx_cuda_floor":
+        _, y_d1, _, s_d1 = mxfp8_cuda.quantize(
+            x, rowwise=False, colwise=True, scaling_mode="floor"
+        )
+
+        for _ in range(2):
+            __ = mxfp8_cuda.quantize(
+                x, rowwise=False, colwise=True, scaling_mode="floor"
+            )
+
+        time_us = benchmark_cuda_function_in_microseconds(
+            lambda x: mxfp8_cuda.quantize(
+                x, rowwise=False, colwise=True, scaling_mode="floor"
+            ),
+            x,
+        )
+
+        assert y_d1.dtype == torch.float8_e4m3fn
+        assert s_d1.dtype == torch.float8_e8m0fnu
+
+        bytes_r = x.numel() * bytes_per_el_bf16
+        bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
+        bps = (bytes_r + bytes_w) / (time_us / 1e6)
+
+    elif mode == "dim1_mx_cuda_rceil":
+        _, y_d1, _, s_d1 = mxfp8_cuda.quantize(
+            x, rowwise=False, colwise=True, scaling_mode="rceil"
+        )
+
+        for _ in range(2):
+            __ = mxfp8_cuda.quantize(
+                x, rowwise=False, colwise=True, scaling_mode="rceil"
+            )
+
+        time_us = benchmark_cuda_function_in_microseconds(
+            lambda x: mxfp8_cuda.quantize(
+                x, rowwise=False, colwise=True, scaling_mode="rceil"
+            ),
+            x,
+        )
+
+        assert y_d1.dtype == torch.float8_e4m3fn
+        assert s_d1.dtype == torch.float8_e8m0fnu
+
+        bytes_r = x.numel() * bytes_per_el_bf16
+        bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
+        bps = (bytes_r + bytes_w) / (time_us / 1e6)
+
     else:
         raise AssertionError(f"unknown mode {mode}")
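
With the new dim1_mx_cuda_floor and dim1_mx_cuda_rceil modes wired in, the CUDA kernel can be benchmarked alongside the compiled and triton paths. A sketch of driving the benchmark directly from Python; the import path is an assumption about how the benchmarks directory resolves (the script is normally invoked through its fire CLI):

# Assumes the repo root is on sys.path, the mxfp8_cuda extension is built,
# and a supported CUDA GPU is available.
from benchmarks.mx_formats.cast_bench import run

run(M=16384, K=16384, BLOCK_SIZE=32, mode="dim1_mx_cuda_floor")
run(M=16384, K=16384, BLOCK_SIZE=32, mode="dim1_mx_cuda_rceil")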

test/prototype/mx_formats/test_kernels.py

Lines changed: 120 additions & 1 deletion

@@ -42,20 +42,37 @@
     triton_to_mxfp8_dim1_reference,
     unpack_uint4,
 )
-from torchao.prototype.mx_formats.mx_tensor import MXTensor
+from torchao.prototype.mx_formats.mx_tensor import MXTensor, ScaleCalculationMode, to_mx
 from torchao.prototype.mx_formats.utils import to_blocked
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_8,
     is_sm_at_least_89,
     is_sm_at_least_100,
 )
 
+try:
+    import mxfp8_cuda
+except ImportError:
+    print("Warning: MXFP8 CUDA extension not available, some tests will be skipped")
+    pass
+
 torch.manual_seed(0)
 
 if not TORCH_VERSION_AT_LEAST_2_8:
     pytest.skip("Unsupported PyTorch version", allow_module_level=True)
 
 
+# TODO: shared utils file for benchmarking and testing
+def to_mx_dim1_reference(x_hp, block_size, scaling_mode):
+    x_hp = x_hp.t().contiguous()
+    scale_d1, data_d1 = to_mx(
+        x_hp, torch.float8_e4m3fn, block_size, scaling_mode=scaling_mode
+    )
+    return data_d1.t(), scale_d1.squeeze(
+        -1
+    )  # torchao impl returns an extra empty dim that the triton / cuda kernels do not
+
+
 @pytest.mark.skip(
     reason="TODO debug CI failure, low pri since this is not used in the MX code"  # noqa: E501
 )
@@ -488,3 +505,105 @@ def test_rearrange(shape):
     eager = to_blocked(scales, False)
     triton = to_blocked(scales, True)
     torch.testing.assert_close(eager, triton, atol=0, rtol=0)
+
+
+@pytest.mark.skipif(
+    not is_sm_at_least_100(),
+    reason="MXFP8 requires CUDA capability 10.0 or greater",
+)
+@pytest.mark.skipif(
+    "mxfp8_cuda" not in globals(),
+    reason="mxfp8_cuda extension not available",
+)
+@pytest.mark.parametrize("M", (32, 64, 2048))
+@pytest.mark.parametrize("K", (32, 64, 2048))
+@pytest.mark.parametrize("input_dtype", (torch.float32, torch.bfloat16))
+@pytest.mark.parametrize(
+    "scaling_mode", (ScaleCalculationMode.FLOOR, ScaleCalculationMode.RCEIL)
+)
+def test_cuda_mx_dim1_numerics(M, K, input_dtype, scaling_mode):
+    scaling_mode_str = (
+        "floor" if scaling_mode == ScaleCalculationMode.FLOOR else "rceil"
+    )
+    block_size = 32
+
+    # Use distinct incrementing values from 0 to M*K-1 to make debugging easier.
+    x = (
+        torch.arange(0, M * K, dtype=input_dtype, device="cuda")
+        .reshape(M, K)
+        .contiguous()
+    )
+
+    y_d1_ref, s_d1_ref = to_mx_dim1_reference(
+        x,
+        block_size=block_size,
+        scaling_mode=scaling_mode,
+    )
+
+    _, y_d1, _, s_d1 = mxfp8_cuda.quantize(
+        x,
+        rowwise=False,
+        colwise=True,
+        scaling_mode=scaling_mode_str,
+        scale_dim_x=1,
+        scale_dim_y=block_size,
+    )
+
+    # check scales
+    torch.testing.assert_close(s_d1, s_d1_ref, rtol=0, atol=0)
+
+    # check quantized values
+    torch.testing.assert_close(y_d1, y_d1_ref, rtol=0, atol=0)
+    assert y_d1.stride() == y_d1_ref.stride(), "quantized tensor strides do not match"
+
+
+@pytest.mark.skipif(
+    not is_sm_at_least_100(),
+    reason="MXFP8 requires CUDA capability 10.0 or greater",
+)
+@pytest.mark.skipif(
+    "mxfp8_cuda" not in globals(),
+    reason="mxfp8_cuda extension not available",
+)
+def test_cuda_mx_dim0_not_supported():
+    M, K = 64, 64
+    block_size = 32
+    x = (
+        torch.arange(0, M * K, dtype=torch.bfloat16, device="cuda")
+        .reshape(M, K)
+        .contiguous()
+    )
+    with pytest.raises(RuntimeError):
+        _, y_d1, _, s_d1 = mxfp8_cuda.quantize(
+            x,
+            rowwise=True,
+            colwise=False,
+            scale_dim_x=block_size,
+            scale_dim_y=1,
+        )
+
+
+@pytest.mark.skipif(
+    not is_sm_at_least_100(),
+    reason="MXFP8 requires CUDA capability 10.0 or greater",
+)
+@pytest.mark.skipif(
+    "mxfp8_cuda" not in globals(),
+    reason="mxfp8_cuda extension not available",
+)
+def test_cuda_mx_dim1_invalid_block_size():
+    M, K = 64, 64
+    x = (
+        torch.arange(0, M * K, dtype=torch.bfloat16, device="cuda")
+        .reshape(M, K)
+        .contiguous()
+    )
+    invalid_block_size = 4
+    with pytest.raises(RuntimeError):
+        _, y_d1, _, s_d1 = mxfp8_cuda.quantize(
+            x,
+            rowwise=False,
+            colwise=True,
+            scale_dim_x=1,
+            scale_dim_y=invalid_block_size,
+        )

torchao/prototype/mx_formats/kernels.py

Lines changed: 8 additions & 4 deletions

@@ -1375,17 +1375,21 @@ def custom_triton_to_mxfp8_dim1_sharding(x, inner_block_size=32):
         return acceptable_shardings
 
     def triton_to_mxfp8_dim1_reference(
-        x_hp: torch.Tensor, block_size
+        x_hp: torch.Tensor,
+        block_size,
+        scaling_mode="FLOOR",
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         A reference version of `to_mxfp8_dim1`.
         """
-        from torchao.prototype.mx_formats.mx_tensor import to_mx
+        from torchao.prototype.mx_formats.mx_tensor import ScaleCalculationMode, to_mx
+
+        scale_mode = ScaleCalculationMode[scaling_mode]
 
         # cast across dim1
         x_hp_d1 = x_hp.t().contiguous()
         scale_e8m0_dim1, x_hp_d1_normalized = to_mx(
-            x_hp_d1, torch.float8_e4m3fn, block_size
+            x_hp_d1, torch.float8_e4m3fn, block_size, scaling_mode=scale_mode
        )
         scale_e8m0_dim1 = scale_e8m0_dim1.view(torch.float8_e8m0fnu)
         return (
@@ -1718,7 +1722,7 @@ def triton_to_mxfp8_dim1(
         raise AssertionError("needs torch version 2.8+ and triton")
 
     def triton_to_mxfp8_dim1_reference(
-        x_hp: torch.Tensor, block_size
+        x_hp: torch.Tensor, block_size, scaling_mode
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         raise AssertionError("needs torch version 2.8+ and triton")
