pytorch
diff --git a/‎benchmarks/float8/bench_matmul.py
Lines changed: 17 additions & 43 deletions b/‎benchmarks/float8/bench_matmul.py
Lines changed: 17 additions & 43 deletions
diff --git a/‎benchmarks/float8/utils.py
Lines changed: 6 additions & 3 deletions b/‎benchmarks/float8/utils.py
Lines changed: 6 additions & 3 deletions
diff --git a/‎setup.py
Lines changed: 25 additions & 56 deletions b/‎setup.py
Lines changed: 25 additions & 56 deletions
diff --git a/‎test/prototype/mx_formats/test_mx_mm.py
Lines changed: 2 additions & 3 deletions b/‎test/prototype/mx_formats/test_mx_mm.py
Lines changed: 2 additions & 3 deletions
diff --git a/‎torchao/__init__.py
Lines changed: 0 additions & 13 deletions b/‎torchao/__init__.py
Lines changed: 0 additions & 13 deletions
diff --git a/‎torchao/csrc/cuda/mx_kernels/mx_fp_cutlass_kernels_sm100a.cu renamed to ‎torchao/csrc/cuda/mx_kernels/mx_fp_cutlass_kernels.cu b/‎torchao/csrc/cuda/mx_kernels/mx_fp_cutlass_kernels_sm100a.cu renamed to ‎torchao/csrc/cuda/mx_kernels/mx_fp_cutlass_kernels.cu
@@ -16,8 +16,6 @@
     get_name_to_shapes_iter,
 )
 
-from torchao.ops import mx_fp4_bf16
-from torchao.prototype.mx_formats.mx_tensor import to_mx
 from torchao.testing.float8.roofline_utils import get_specs
 
 
@@ -64,19 +62,13 @@ def run(
 ):
     device = "cuda"
     # TODO(future PR): this is ugly
-    assert recipe in ("tensorwise", "rowwise", "mxfp8_cublas", "mxfp4_cutlass"), (
-        "unsupported"
-    )
-    use_fp4 = recipe == "mxfp4_cutlass"
+    assert recipe in ("tensorwise", "rowwise", "mxfp8_cublas"), "unsupported"
 
     specs = get_specs()
     bf16_peak_tops = specs["bf16_peak_tops"]
     fp8_peak_tops = specs["fp8_peak_tops"]
-    fp4_peak_tops = specs["fp4_peak_tops"]
     print(f"gpu_name: {torch.cuda.get_device_name(0)}")
-    print(
-        f"peak tops: bf16 {bf16_peak_tops:.2e}, fp8 {fp8_peak_tops:.2e}, fp4 {fp4_peak_tops:.2e}"
-    )
+    print(f"peak tops: bf16 {bf16_peak_tops:.2e}, fp8 {fp8_peak_tops:.2e}")
 
     headers = (
         "fast_accum",
@@ -85,14 +77,14 @@ def run(
         "K",
         "N",
         "ref_time_s",
-        "time_s",
-        "speedup",
+        "fp8_time_s",
+        "fp8_speedup",
     )
     results = []
 
     dtype = torch.bfloat16
     name_to_shapes = get_name_to_shapes_iter(shape_gen_name, M, K, N)
-    fast_accum_vals = [False] if use_fp4 else [True, False]
+    fast_accum_vals = [True, False]
 
     for idx, (fast_accum, (name, (M, K, N))) in enumerate(
         itertools.product(fast_accum_vals, name_to_shapes)
@@ -115,53 +107,35 @@ def run(
 
         del A
 
-        A_hp = torch.randn(M, K, device=device)
-        B_hp_t = torch.randn(N, K, device=device)
-
-        if use_fp4:
-            _, A = to_mx(A_hp, torch.float4_e2m1fn_x2, 32)
-            _, Bt = to_mx(B_hp_t, torch.float4_e2m1fn_x2, 32)
-            B = Bt.contiguous().T
-            peak_tops = fp4_peak_tops
-        else:
-            # raw float8 matmul (upper bound for what we can achive in eager mode)
-            # TODO(future): add e5m2
-            d1, d2, d3 = torch.float8_e4m3fn, torch.float8_e4m3fn, dtype
-            A = A_hp.to(d1)
-            B = B_hp_t.to(d2).contiguous().T
-            peak_tops = fp8_peak_tops
-
+        # raw float8 matmul (upper bound for what we can achieve in eager mode)
+        # TODO(future): add e5m2
+        d1, d2, d3 = torch.float8_e4m3fn, torch.float8_e4m3fn, dtype
+        A = torch.zeros(M, K, device=device, dtype=d1)
+        B = torch.zeros(K, N, device=device, dtype=d2).t().contiguous().t()
         if recipe == "tensorwise":
             scale_a = torch.tensor([1.0], device=device)
             scale_b = torch.tensor([1.0], device=device)
         elif recipe == "rowwise":
             scale_a = torch.ones(M, 1, device=device)
             scale_b = torch.ones(1, N, device=device)
-        elif recipe in ("mxfp8_cublas", "mxfp4_cutlass"):
+        elif recipe == "mxfp8_cublas":
             scale_a = torch.ones(M, K // 32, device=device, dtype=torch.float8_e8m0fnu)
             scale_b = torch.ones(N, K // 32, device=device, dtype=torch.float8_e8m0fnu)
         else:
             assert False, f"unknown recipe {recipe}"
 
-        def do_matmul_fp8(A, B):
+        def do_matmul(A, B):
             nonlocal scale_a
             nonlocal scale_b
             return torch._scaled_mm(
                 A, B, scale_a, scale_b, out_dtype=d3, use_fast_accum=fast_accum
             )
 
-        def do_matmul_mxfp4(A, B):
-            nonlocal scale_a
-            nonlocal scale_b
-            return mx_fp4_bf16(A, B, scale_a, scale_b)
-
-        do_matmul = do_matmul_mxfp4 if use_fp4 else do_matmul_fp8
-
-        time_sec, tops_sec, pct_top_peak = do_benchmarks(
-            tops, peak_tops, use_gpu_kernel_time, do_matmul, A, B
+        fp8_time_sec, fp8_tops_sec, fp8_pct_top_peak = do_benchmarks(
+            tops, fp8_peak_tops, use_gpu_kernel_time, do_matmul, A, B
         )
         print(
-            f"time_sec {time_sec:.2E}, tops/sec {tops_sec:.2E}, pct_peak {pct_top_peak:.3f}"
+            f"fp8 time_sec {fp8_time_sec:.2E}, tops/sec {fp8_tops_sec:.2E}, pct_peak {fp8_pct_top_peak:.3f}"
         )
 
         del A, B, scale_a, scale_b
@@ -174,8 +148,8 @@ def do_matmul_mxfp4(A, B):
                 K,
                 N,
                 ref_time_sec,
-                time_sec,
-                ref_time_sec / time_sec,
+                fp8_time_sec,
+                ref_time_sec / fp8_time_sec,
             ]
         )
 
 
@@ -352,6 +352,9 @@ def get_gpu_kernel_gemm_time_s(f, *args, **kwargs):
     )
     # there is only 1 key, aten::mm or aten::_scaled_mm, with unit nanoseconds
     assert len(data) == 1
-    key, value = next(iter(data.items()))
-    assert key in ("aten::mm", "aten::_scaled_mm", "torchao::mx_fp4_bf16")
-    return value / 1e6 / n_iter
+    if "aten::mm" in data:
+        return data["aten::mm"] / 1e6 / n_iter
+    elif "aten::_scaled_mm" in data:
+        return data["aten::_scaled_mm"] / 1e6 / n_iter
+    else:
+        raise AssertionError("unexpected format of data")
@@ -272,18 +272,15 @@ def get_cutlass_build_flags():
             raise ValueError("No CUDA version found")
 
         major, minor = map(int, cuda_version.split(".")[:2])
-        build_sm90a = (major, minor) >= (12, 6)
-        build_sm100a = (major, minor) >= (12, 8)
-        build_sm120a = (major, minor) >= (12, 8)
+        build_sm90a = major > 12 or (major == 12 and minor >= 6)
+        build_sm100a = major > 12 or (major == 12 and minor >= 8)
 
         if build_sm90a:
             print(f"CUDA {cuda_version}: Enabling SM90a CUTLASS kernels")
         if build_sm100a:
             print(f"CUDA {cuda_version}: Enabling SM100a CUTLASS kernels")
-        if build_sm120a:
-            print(f"CUDA {cuda_version}: Enabling SM120a CUTLASS kernels")
 
-        return build_sm90a, build_sm100a, build_sm120a
+        return build_sm90a, build_sm100a
     except:
         # Fallback to architecture flags
         cuda_arch_flags = _get_cuda_arch_flags()
@@ -343,11 +340,6 @@ def __init__(
         self.cmake_args = cmake_args
 
 
-def remove_items(a: list, b: list) -> list:
-    """Remove items in list b from list a"""
-    return [x for x in a if x not in b]
-
-
 def get_extensions():
     # Skip building C++ extensions if USE_CPP is set to "0"
     if use_cpp == "0":
@@ -462,7 +454,7 @@ def get_extensions():
         excluded_sources = list(
             glob.glob(os.path.join(extensions_dir, "cpu/*.cpp"), recursive=True)
         )
-        sources = remove_items(sources, excluded_sources)
+        sources = [s for s in sources if s not in excluded_sources]
 
     # Collect CUDA source files
     extensions_cuda_dir = os.path.join(extensions_dir, "cuda")
@@ -506,24 +498,22 @@ def get_extensions():
         rocm_sources = list(
             glob.glob(os.path.join(extensions_rocm_dir, "**/*.cpp"), recursive=True)
         )
-        sources = remove_items(sources, rocm_sources)
+        sources = [s for s in sources if s not in rocm_sources]
 
-    use_cutlass = use_cuda and not IS_WINDOWS
+    use_cutlass = False
     cutlass_90a_sources = None
     cutlass_100a_sources = None
-    cutlass_120a_sources = None
     build_for_sm90a = False
     build_for_sm100a = False
-    build_for_sm120a = False
-
-    if use_cutlass:
+    if use_cuda and not IS_WINDOWS:
+        use_cutlass = True
         cutlass_dir = os.path.join(third_party_path, "cutlass")
         cutlass_include_dir = os.path.join(cutlass_dir, "include")
         cutlass_tools_include_dir = os.path.join(
             cutlass_dir, "tools", "util", "include"
         )
         cutlass_extensions_include_dir = os.path.join(cwd, extensions_cuda_dir)
-
+    if use_cutlass:
         extra_compile_args["nvcc"].extend(
             [
                 "-DTORCHAO_USE_CUTLASS",
@@ -543,7 +533,7 @@ def get_extensions():
             ]
         )
 
-        build_for_sm90a, build_for_sm100a, build_for_sm120a = get_cutlass_build_flags()
+        build_for_sm90a, build_for_sm100a = get_cutlass_build_flags()
         # Define sm90a sources
         cutlass_90a_sources = [
             os.path.join(
@@ -567,40 +557,40 @@ def get_extensions():
                     "rowwise_scaled_linear_sparse_cutlass_" + dtypes + ".cu",
                 )
             )
-        sources = remove_items(sources, cutlass_90a_sources)
+        # Always remove sm90a sources from main sources
+        sources = [s for s in sources if s not in cutlass_90a_sources]
 
         # Always compile mx_fp_cutlass_kernels.cu ONLY with sm100a architecture
         cutlass_100a_sources = [
             os.path.join(
                 extensions_cuda_dir,
                 "mx_kernels",
-                "mx_fp_cutlass_kernels_sm100a.cu",
+                "mx_fp_cutlass_kernels.cu",
             ),
         ]
-        sources = remove_items(sources, cutlass_100a_sources)
-
-        # Always compile mx_fp_cutlass_kernels.cu ONLY with sm120a architecture
-        cutlass_120a_sources = [
-            os.path.join(
-                extensions_cuda_dir,
-                "mx_kernels",
-                "mx_fp_cutlass_kernels_sm120a.cu",
-            ),
+        # Remove from main sources to prevent compilation with other architectures
+        sources = [
+            s for s in sources if os.path.basename(s) != "mx_fp_cutlass_kernels.cu"
         ]
-        sources = remove_items(sources, cutlass_120a_sources)
 
     else:
-        # Remove CUTLASS-based kernels from the sources list.  An assumption is that
-        # these files will have "cutlass" in its name.
+        # Remove CUTLASS-based kernels from the sources list.  The
+        # assumption is that each of these files has "cutlass" in
+        # its name.
         cutlass_sources = list(
             glob.glob(
                 os.path.join(extensions_cuda_dir, "**/*cutlass*.cu"), recursive=True
             )
         )
-        sources = remove_items(sources, cutlass_sources)
+        sources = [s for s in sources if s not in cutlass_sources]
 
     ext_modules = []
     if len(sources) > 0:
+        # Double-check to ensure mx_fp_cutlass_kernels.cu is not in sources
+        sources = [
+            s for s in sources if os.path.basename(s) != "mx_fp_cutlass_kernels.cu"
+        ]
+
         ext_modules.append(
             extension(
                 "torchao._C",
@@ -653,27 +643,6 @@ def get_extensions():
             )
         )
 
-    # Only build the cutlass_120a extension if sm120a is in the architecture flags
-    if (
-        cutlass_120a_sources is not None
-        and len(cutlass_120a_sources) > 0
-        and build_for_sm120a
-    ):
-        cutlass_120a_extra_compile_args = copy.deepcopy(extra_compile_args)
-        # Only use sm120a architecture for these sources, ignoring cuda_arch_flags
-        cutlass_120a_extra_compile_args["nvcc"].append(
-            "-gencode=arch=compute_120a,code=sm_120a"
-        )
-        ext_modules.append(
-            extension(
-                "torchao._C_cutlass_120a",
-                cutlass_120a_sources,
-                py_limited_api=True,
-                extra_compile_args=cutlass_120a_extra_compile_args,
-                extra_link_args=extra_link_args,
-            )
-        )
-
     # Build CMakeLists from /torchao/experimental - additional options become available : TORCHAO_BUILD_CPU_AARCH64, TORCHAO_BUILD_KLEIDIAI, TORCHAO_BUILD_MPS_OPS, TORCHAO_PARALLEL_BACKEND
     if build_macos_arm_auto or os.getenv("BUILD_TORCHAO_EXPERIMENTAL") == "1":
         build_options = BuildOptions()
 
@@ -14,7 +14,7 @@
 from torchao.prototype.mx_formats.utils import to_blocked
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_8,
-    is_sm_version,
+    is_sm_at_least_100,
 )
 
 if not TORCH_VERSION_AT_LEAST_2_8:
@@ -59,8 +59,7 @@ def run_matrix_test(M: int, K: int, N: int, format) -> float:
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(
-    not (is_sm_version(10, 0) or is_sm_version(12, 0)),
-    reason="CUDA capability 10.0 or 12.0 is required for mxfloat8",
+    not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for mxfloat8"
 )
 @pytest.mark.parametrize(
     "size",
 
@@ -25,21 +25,8 @@
 
     so_files = list(Path(__file__).parent.glob("_C*.so"))
     if len(so_files) > 0:
-        compute_capability = (
-            torch.cuda.get_device_capability() if torch.cuda.is_available() else None
-        )
-
         for file in so_files:
-            # only load architecture-specific target if the current GPU matches that target
-            if (
-                ("cutlass_90a" in file.name and compute_capability != (9, 0))
-                or ("cutlass_100a" in file.name and compute_capability != (10, 0))
-                or ("cutlass_120a" in file.name and compute_capability != (12, 0))
-            ):
-                continue
-
             torch.ops.load_library(str(file))
-
         from . import ops
 
     # The following library contains CPU kernels from torchao/experimental