
Commit c376285

alexsamardzic authored and pytorchmergebot committed
Add CUTLASS-based row-wise scaled sparse FP8 kernel (#1671)
Pull Request resolved: #1671
Approved by: https://github.com/jcaip
1 parent 0eea64a commit c376285

32 files changed: +2177 −528 lines

benchmarks/benchmark_rowwise_scaled_linear_cutlass.py

Lines changed: 35 additions & 21 deletions
@@ -7,41 +7,55 @@
     rowwise_scaled_linear_cutlass_s4s4,
     rowwise_scaled_linear_cutlass_s8s4,
 )
+from torchao.quantization.quant_api import (
+    _int4_symm_cutlass_quant,
+    _int8_symm_cutlass_quant,
+)
+
+dtype = torch.bfloat16
+dtypeq = torch.int8
+dtype_scale = torch.float32
+device = torch.device("cuda")
 
 
 def benchmark_microseconds(f, *args):
     return do_bench(lambda: f(*args), return_mode="median") * 1e3
 
 
-def get_problem(m: int, n: int, k: int, A_nbits: int, B_nbits: int):
-    assert A_nbits in (4, 8) and B_nbits in (4, 8)
+def get_problem(m: int, n: int, k: int, Xq_nbits: int):
+    assert k % 2 == 0
+    assert Xq_nbits in [4, 8]
+
+    X_ref = torch.randn((m, k), dtype=dtype, device=device)
+    W_ref = torch.rand((n, k), dtype=dtype, device=device)
 
-    dev = torch.device("cuda")
-    A = torch.randint(-128, 127, (m, k * A_nbits // 8), dtype=torch.int8, device=dev)
-    A_scale = torch.randn((m,), dtype=torch.half, device=dev)
-    B = torch.randint(
-        -128, 127, size=(n, k * B_nbits // 8), dtype=torch.int8, device=dev
+    X_quant_func = (
+        _int4_symm_cutlass_quant if Xq_nbits == 4 else _int8_symm_cutlass_quant
     )
-    B_scale = torch.randn((n,), dtype=torch.half, device=dev)
-    C = None
+    W_quant_func = _int4_symm_cutlass_quant
+    X_aqt = X_quant_func(X_ref)
+    W_aqt = W_quant_func(W_ref)
 
-    return A, A_scale, B, B_scale, C
+    Xq = X_aqt.tensor_impl.int_data
+    X_scale = X_aqt.tensor_impl.scale
+    Wq = W_aqt.tensor_impl.int_data
+    W_scale = W_aqt.tensor_impl.scale
+    bias = None
+    out_dtype = dtype
 
+    return (X_ref, W_ref), (Xq, X_scale, Wq, W_scale, bias, out_dtype)
 
-def benchmark(m: int, k: int, n: int):
-    dev = torch.device("cuda")
-    A_ref = torch.randn((m, k), dtype=torch.half, device=dev)
-    B_ref = torch.randn((n, k), dtype=torch.half, device=dev)
-    fp16_time = benchmark_microseconds(torch.nn.functional.linear, A_ref, B_ref)
 
-    A, A_scale, B, B_scale, C = get_problem(m, n, k, 8, 4)
-    rowwise_scaled_linear_cutlass_s8s4_time = benchmark_microseconds(
-        rowwise_scaled_linear_cutlass_s8s4, A, A_scale, B, B_scale, C
+def benchmark(m: int, k: int, n: int):
+    ref_args, args = get_problem(m, n, k, 4)
+    fp16_time = benchmark_microseconds(torch.nn.functional.linear, *ref_args)
+    rowwise_scaled_linear_cutlass_s4s4_time = benchmark_microseconds(
+        rowwise_scaled_linear_cutlass_s4s4, *args
     )
 
-    A, A_scale, B, B_scale, C = get_problem(m, n, k, 4, 4)
-    rowwise_scaled_linear_cutlass_s4s4_time = benchmark_microseconds(
-        rowwise_scaled_linear_cutlass_s4s4, A, A_scale, B, B_scale, C
+    _, args = get_problem(m, n, k, 8)
+    rowwise_scaled_linear_cutlass_s8s4_time = benchmark_microseconds(
+        rowwise_scaled_linear_cutlass_s8s4, *args
     )
 
     return {
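The refactored get_problem now returns both the reference operands and the already-quantized arguments, in exactly the order the CUTLASS op expects. For reference, a minimal sketch of one such call outside the benchmark harness (shapes are arbitrary illustrations; all names are taken from the imports in this diff):

    import torch
    from torchao.ops import rowwise_scaled_linear_cutlass_s8s4
    from torchao.quantization.quant_api import (
        _int4_symm_cutlass_quant,
        _int8_symm_cutlass_quant,
    )

    m, n, k = 16, 32, 64  # hypothetical shapes; k must be even for int4 packing
    X_ref = torch.randn((m, k), dtype=torch.bfloat16, device="cuda")
    W_ref = torch.rand((n, k), dtype=torch.bfloat16, device="cuda")

    X_aqt = _int8_symm_cutlass_quant(X_ref)  # int8 activations
    W_aqt = _int4_symm_cutlass_quant(W_ref)  # int4 weights, packed two per int8

    out = rowwise_scaled_linear_cutlass_s8s4(
        X_aqt.tensor_impl.int_data,  # Xq
        X_aqt.tensor_impl.scale,     # X_scale
        W_aqt.tensor_impl.int_data,  # Wq
        W_aqt.tensor_impl.scale,     # W_scale
        None,                        # bias
        torch.bfloat16,              # out_dtype
    )
    # out should approximate torch.nn.functional.linear(X_ref, W_ref)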
benchmarks/benchmark_rowwise_scaled_linear_sparse_cutlass.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+import pandas as pd
+import torch
+from tqdm import tqdm
+from triton.testing import do_bench
+
+from torchao.ops import rowwise_scaled_linear_sparse_cutlass_f8f8
+from torchao.quantization.quant_api import (
+    _float8_cutlass_quant,
+    _float8_cutlass_quant_sparse,
+)
+from torchao.sparsity.utils import create_semi_structured_tensor
+
+dtype = torch.bfloat16
+dtypeq_X = torch.float8_e5m2
+dtypeq_W = torch.float8_e4m3fn
+device = torch.device("cuda")
+
+
+def benchmark_microseconds(f, *args):
+    return do_bench(lambda: f(*args), return_mode="median") * 1e3
+
+
+def get_problem(m: int, n: int, k: int):
+    X_ref = torch.randn((m, k), dtype=dtype, device=device)
+    W_ref = create_semi_structured_tensor(n, k, dtype=dtype).to(device)
+
+    X_quant_func = _float8_cutlass_quant
+    W_quant_func = _float8_cutlass_quant_sparse
+    X_aqt = X_quant_func(X_ref, dtypeq_X)
+    W_aqt = W_quant_func(W_ref, dtypeq_W)
+
+    Xq = X_aqt.tensor_impl.float8_data
+    X_scale = X_aqt.tensor_impl.scale
+    Wq_sparse = W_aqt.tensor_impl.sparse
+    W_meta = W_aqt.tensor_impl.meta
+    W_scale = W_aqt.tensor_impl.scale
+    bias = None
+    out_dtype = dtype
+
+    return (X_ref, W_ref), (Xq, X_scale, Wq_sparse, W_meta, W_scale, bias, out_dtype)
+
+
+def benchmark(m: int, k: int, n: int):
+    ref_args, args = get_problem(m, n, k)
+    fp16_time = benchmark_microseconds(torch.nn.functional.linear, *ref_args)
+    rowwise_scaled_linear_sparse_cutlass_f8f8_time = benchmark_microseconds(
+        rowwise_scaled_linear_sparse_cutlass_f8f8, *args
+    )
+
+    return {
+        "m": m,
+        "k": k,
+        "n": n,
+        "fp16_latency (ms)": fp16_time,
+        "rowwise_scaled_linear_sparse_cutlass_f8f8 latency (ms)": rowwise_scaled_linear_sparse_cutlass_f8f8_time,
+        "f8f8 speedup (d/s)": fp16_time
+        / rowwise_scaled_linear_sparse_cutlass_f8f8_time,
+    }
+
+
+if __name__ == "__main__":
+    k_vals = (8192, 8192, 8192, 28672)
+    n_vals = (8192, 10240, 57344, 8192)
+
+    results = []
+    for m in tqdm([1 << i for i in range(10)]):
+        for n, k in zip(n_vals, k_vals):
+            results.append(benchmark(m, k, n))
+
+    df = pd.DataFrame(results)
+    df.to_csv("rowwise_scaled_linear_sparse_cutlass_time_results.csv", index=False)
+    print(df.to_markdown(index=False))
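The weight path depends on 2:4 ("semi-structured") sparsity: in every contiguous group of four values along the reduction dimension, only two are non-zero, so CUTLASS can store half the values plus a compact metadata tensor (the sparse/meta pair unpacked above). A minimal sanity check of that invariant, assuming create_semi_structured_tensor keeps the signature used in this benchmark:

    import torch
    from torchao.sparsity.utils import create_semi_structured_tensor

    W = create_semi_structured_tensor(128, 256, dtype=torch.bfloat16)
    groups = W.reshape(128, -1, 4)  # groups of four along the k dimension
    assert ((groups != 0).sum(dim=-1) <= 2).all()  # 2:4: at most two non-zeros per group

The FP8 dtype split is also deliberate: torch.finfo reports a max of 57344 for float8_e5m2 (more exponent range, used here for activations) versus 448 for float8_e4m3fn (more mantissa precision, used for weights).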

docs/source/api_ref_dtypes.rst

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ Layouts and Tensor Subclasses
     MarlinQQQLayout
     Int4CPULayout
     CutlassInt4PackedLayout
+    CutlassSemiSparseLayout
 
 Quantization techniques
 -----------------------

setup.py

Lines changed: 89 additions & 30 deletions
@@ -3,6 +3,7 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
+import copy
 import glob
 import os
 import subprocess
@@ -75,6 +76,7 @@ def use_debug_mode():
     BuildExtension,
     CppExtension,
     CUDAExtension,
+    _get_cuda_arch_flags,
 )
 
 IS_ROCM = (torch.version.hip is not None) and (ROCM_HOME is not None)
@@ -269,7 +271,12 @@ def get_extensions():
     extra_link_args = []
     extra_compile_args = {
         "cxx": [f"-DPy_LIMITED_API={PY3_9_HEXCODE}"],
-        "nvcc": ["-O3" if not debug_mode else "-O0", "-t=0", "-std=c++17"],
+        "nvcc": [
+            "-DNDEBUG" if not debug_mode else "-DDEBUG",
+            "-O3" if not debug_mode else "-O0",
+            "-t=0",
+            "-std=c++17",
+        ],
     }
 
     if not IS_WINDOWS:
@@ -304,25 +311,6 @@ def get_extensions():
     if use_cuda:
         sources += cuda_sources
 
-    use_cutlass = False
-    if use_cuda and not IS_WINDOWS:
-        use_cutlass = True
-        cutlass_dir = os.path.join(third_party_path, "cutlass")
-        cutlass_include_dir = os.path.join(cutlass_dir, "include")
-        cutlass_tools_include_dir = os.path.join(
-            cutlass_dir, "tools", "util", "include"
-        )
-        cutlass_extensions_include_dir = os.path.join(cwd, extensions_cuda_dir)
-    if use_cutlass:
-        extra_compile_args["nvcc"].extend(
-            [
-                "-DTORCHAO_USE_CUTLASS",
-                "-I" + cutlass_include_dir,
-                "-I" + cutlass_tools_include_dir,
-                "-I" + cutlass_extensions_include_dir,
-            ]
-        )
-
     # Get base directory and source paths
     curdir = os.path.dirname(os.path.curdir)
     extensions_dir = os.path.join(curdir, "torchao", "csrc")
@@ -349,16 +337,6 @@ def get_extensions():
     # Collect CUDA source files if needed
     if not IS_ROCM and use_cuda:
         sources += cuda_sources
-    else:
-        # Remove CUTLASS-based kernels from the cuda_sources list. An
-        # assumption is that these files will have "cutlass" in its
-        # name.
-        cutlass_sources = list(
-            glob.glob(
-                os.path.join(extensions_cuda_dir, "**/*cutlass*.cu"), recursive=True
-            )
-        )
-        sources = [s for s in sources if s not in cutlass_sources]
 
     # TOOD: Remove this and use what CUDA has once we fix all the builds.
     if IS_ROCM and use_cuda:
@@ -372,6 +350,72 @@ def get_extensions():
     else:
         sources += hip_sources
 
+    use_cutlass = False
+    cutlass_90a_sources = None
+    if use_cuda and not IS_ROCM and not IS_WINDOWS:
+        use_cutlass = True
+        cutlass_dir = os.path.join(third_party_path, "cutlass")
+        cutlass_include_dir = os.path.join(cutlass_dir, "include")
+        cutlass_tools_include_dir = os.path.join(
+            cutlass_dir, "tools", "util", "include"
+        )
+        cutlass_extensions_include_dir = os.path.join(cwd, extensions_cuda_dir)
+    if use_cutlass:
+        extra_compile_args["nvcc"].extend(
+            [
+                "-DTORCHAO_USE_CUTLASS",
+                "-I" + cutlass_include_dir,
+                "-I" + cutlass_tools_include_dir,
+                "-I" + cutlass_extensions_include_dir,
+                "-DCUTE_USE_PACKED_TUPLE=1",
+                "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
+                "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
+                "-DCUTLASS_DEBUG_TRACE_LEVEL=0",
+                "--ftemplate-backtrace-limit=0",
+                # "--keep",
+                # "--ptxas-options=--verbose,--register-usage-level=5,--warn-on-local-memory-usage",
+                # "--resource-usage",
+                # "-lineinfo",
+                # "-DCUTLASS_ENABLE_GDC_FOR_SM90",  # https://github.com/NVIDIA/cutlass/blob/main/media/docs/dependent_kernel_launch.md
+            ]
+        )
+
+        cuda_arch_flags = _get_cuda_arch_flags()
+        build_for_sm90 = "-gencode=arch=compute_90,code=sm_90" in cuda_arch_flags
+        build_for_sm90a = "-gencode=arch=compute_90a,code=sm_90a" in cuda_arch_flags
+        if build_for_sm90 and not build_for_sm90a:
+            cutlass_90a_sources = [
+                os.path.join(
+                    extensions_cuda_dir,
+                    "rowwise_scaled_linear_sparse_cutlass",
+                    "rowwise_scaled_linear_sparse_cutlass_f8f8.cu",
+                ),
+                os.path.join(
+                    extensions_cuda_dir,
+                    "to_sparse_semi_structured_cutlass_sm9x",
+                    "to_sparse_semi_structured_cutlass_sm9x_f8.cu",
+                ),
+            ]
+            for dtypes in ["e4m3e4m3", "e4m3e5m2", "e5m2e4m3", "e5m2e5m2"]:
+                cutlass_90a_sources.append(
+                    os.path.join(
+                        extensions_cuda_dir,
+                        "rowwise_scaled_linear_sparse_cutlass",
+                        "rowwise_scaled_linear_sparse_cutlass_" + dtypes + ".cu",
+                    )
+                )
+            sources = [s for s in sources if s not in cutlass_90a_sources]
+    else:
+        # Remove CUTLASS-based kernels from the sources list. An
+        # assumption is that these files will have "cutlass" in its
+        # name.
+        cutlass_sources = list(
+            glob.glob(
+                os.path.join(extensions_cuda_dir, "**/*cutlass*.cu"), recursive=True
+            )
+        )
+        sources = [s for s in sources if s not in cutlass_sources]
+
     ext_modules = []
     if len(sources) > 0:
         ext_modules.append(
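The sm_90a gating keys off the exact flag strings that torch.utils.cpp_extension generates: a plain sm_90 entry in the arch list means the FP8 sparse kernels, which need the 90a feature set, are pulled out of the default source list and built separately. A quick way to inspect what the build will match against (a sketch, assuming the private _get_cuda_arch_flags helper keeps its current behavior of reading TORCH_CUDA_ARCH_LIST when called with no arguments):

    import os

    os.environ["TORCH_CUDA_ARCH_LIST"] = "9.0"  # hypothetical arch list
    from torch.utils.cpp_extension import _get_cuda_arch_flags

    print(_get_cuda_arch_flags())
    # expected: ['-gencode=arch=compute_90,code=sm_90'] -- so build_for_sm90 is
    # True, build_for_sm90a is False, and the CUTLASS FP8 sparse sources are
    # split out for a separate sm_90a compilation pass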
@@ -384,6 +428,21 @@ def get_extensions():
             )
         )
 
+    if cutlass_90a_sources is not None and len(cutlass_90a_sources) > 0:
+        cutlass_90a_extra_compile_args = copy.deepcopy(extra_compile_args)
+        cutlass_90a_extra_compile_args["nvcc"].extend(
+            cuda_arch_flags + ["-gencode=arch=compute_90a,code=sm_90a"]
+        )
+        ext_modules.append(
+            extension(
+                "torchao._C",
+                cutlass_90a_sources,
+                py_limited_api=True,
+                extra_compile_args=cutlass_90a_extra_compile_args,
+                extra_link_args=extra_link_args,
+            )
+        )
+
     if build_torchao_experimental:
         build_options = BuildOptions()
 
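The copy.deepcopy above is load-bearing: extra_compile_args nests a list, so a shallow copy would share the "nvcc" list between both extensions and the 90a-only -gencode flags would leak into the base build. A minimal illustration in plain Python:

    import copy

    extra_compile_args = {"nvcc": ["-O3", "-std=c++17"]}

    shallow = dict(extra_compile_args)
    shallow["nvcc"].append("-gencode=arch=compute_90a,code=sm_90a")
    print(extra_compile_args["nvcc"])  # base flags mutated: the nested list is shared

    extra_compile_args = {"nvcc": ["-O3", "-std=c++17"]}
    deep = copy.deepcopy(extra_compile_args)
    deep["nvcc"].append("-gencode=arch=compute_90a,code=sm_90a")
    print(extra_compile_args["nvcc"])  # unchanged: deepcopy duplicated the list

Note that both source sets are registered under the same extension name, "torchao._C", so the sm_90a kernels link into the single torchao extension module alongside the default build.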