@@ -11,8 +11,18 @@
 from tritonbench.operators.gemm.kernels import matmul as kernels
 from tritonbench.operators.gemm.partition_k import matmul_partition_k
 from tritonbench.operators.gemm.stream_k import streamk_matmul
+from tritonbench.operators.gemm.warp_spec_persistent_matmul import (
+    blackwell_matmul_descriptor_persistent,
+    blackwell_matmul_tma,
+    blackwell_matmul_tma_persistent,
+)
 from tritonbench.utils.data_utils import get_production_shapes
-from tritonbench.utils.env_utils import is_cuda, is_fbcode, supports_tma
+from tritonbench.utils.env_utils import (
+    get_nvidia_gpu_model,
+    is_cuda,
+    is_fbcode,
+    supports_tma,
+)

 from tritonbench.utils.path_utils import REPO_PATH
@@ -94,6 +104,8 @@
     for k in [4096 * i for i in range(1, 9)]
 ]

+IS_B200 = is_cuda() and get_nvidia_gpu_model() == "NVIDIA B200"
+

 @contextlib.contextmanager
 def set_env_variable(key, value):
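`IS_B200` is evaluated once at import time and short-circuits on non-CUDA builds before the GPU model is queried. A minimal sketch of an equivalent gate, assuming `get_nvidia_gpu_model()` is a thin wrapper over `torch.cuda.get_device_name()` (an assumption; the tritonbench helper may normalize the name differently):

```python
import torch

def is_b200() -> bool:
    # Check CUDA availability first: querying the device name on a CPU-only
    # build would raise, which is why the diff guards with is_cuda().
    if not torch.cuda.is_available():
        return False
    # get_device_name() reports the marketing name of device 0, e.g. "NVIDIA B200".
    return torch.cuda.get_device_name() == "NVIDIA B200"
```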
@@ -350,6 +362,74 @@ def decompose_func(a_in, b_in):
         else:
             return lambda: compiled_decompose_k(a, b)

+    if IS_B200:
+
+        @register_benchmark(enabled=False)
+        def triton_blackwell_warpspec_persistent_matmul(self, a, b, bias) -> Callable:
+            if bias is not None:
+                return (
+                    lambda: blackwell_matmul_tma_persistent(a, b, warp_specialize=True)
+                    + bias
+                )
+            else:
+                return lambda: blackwell_matmul_tma_persistent(
+                    a, b, warp_specialize=True
+                )
+
+        @register_benchmark(enabled=False)
+        def triton_blackwell_persistent_matmul(self, a, b, bias) -> Callable:
+            if bias is not None:
+                return (
+                    lambda: blackwell_matmul_tma_persistent(a, b, warp_specialize=False)
+                    + bias
+                )
+            else:
+                return lambda: blackwell_matmul_tma_persistent(
+                    a, b, warp_specialize=False
+                )
+
+        @register_benchmark(enabled=False)
+        def triton_blackwell_warpspec_tma_matmul(self, a, b, bias) -> Callable:
+            if bias is not None:
+                return lambda: blackwell_matmul_tma(a, b, warp_specialize=True) + bias
+            else:
+                return lambda: blackwell_matmul_tma(a, b, warp_specialize=True)
+
+        @register_benchmark(enabled=False)
+        def triton_blackwell_tma_matmul(self, a, b, bias) -> Callable:
+            if bias is not None:
+                return lambda: blackwell_matmul_tma(a, b, warp_specialize=False) + bias
+            else:
+                return lambda: blackwell_matmul_tma(a, b, warp_specialize=False)
+
+        @register_benchmark(enabled=False)
+        def triton_blackwell_warpspec_descriptor_matmul(self, a, b, bias) -> Callable:
+            if bias is not None:
+                return (
+                    lambda: blackwell_matmul_descriptor_persistent(
+                        a, b, warp_specialize=True
+                    )
+                    + bias
+                )
+            else:
+                return lambda: blackwell_matmul_descriptor_persistent(
+                    a, b, warp_specialize=True
+                )
+
+        @register_benchmark(enabled=False)
+        def triton_blackwell_descriptor_matmul(self, a, b, bias) -> Callable:
+            if bias is not None:
+                return (
+                    lambda: blackwell_matmul_descriptor_persistent(
+                        a, b, warp_specialize=False
+                    )
+                    + bias
+                )
+            else:
+                return lambda: blackwell_matmul_descriptor_persistent(
+                    a, b, warp_specialize=False
+                )
+
     @register_x_val(label="(M, N, K)")
     def get_x_val(self, example_inputs) -> Tuple[int, int, int]:
         # x-value: computation intensity
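All six variants follow one pattern: close over `a` and `b`, and fold the optional `bias` into the returned callable so the harness times the epilogue add together with the kernel. A hypothetical consolidation of that pattern — `_wrap_with_bias` is illustrative only, not part of this diff:

```python
from functools import partial
from typing import Callable, Optional

import torch

def _wrap_with_bias(
    kernel: Callable[[], torch.Tensor], bias: Optional[torch.Tensor]
) -> Callable[[], torch.Tensor]:
    # Keep the result zero-argument, matching the lambdas in the diff, so the
    # benchmark harness measures the kernel launch plus the optional bias add.
    if bias is None:
        return kernel
    return lambda: kernel() + bias

# Each benchmark body would then reduce to a single line, e.g.:
#     return _wrap_with_bias(partial(blackwell_matmul_tma, a, b, warp_specialize=False), bias)
```

Since every variant is registered with `enabled=False`, none of them run by default; they have to be selected explicitly (via tritonbench's usual benchmark filters, e.g. `--only`, if those conventions apply here).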