@@ -141,6 +141,72 @@ def _matmul_make_precompiler(x: torch.Tensor, y: torch.Tensor):
    return make_precompiler(_matmul_kernel)(x, y, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=4, num_stages=3)""",
        )

+    def test_bmm(self):
+        args = (
+            torch.randn([16, 512, 768], device=DEVICE, dtype=torch.float16),
+            torch.randn([16, 768, 1024], device=DEVICE, dtype=torch.float16),
+        )
+        self.assertExpectedInline(
+            run_example(
+                "bmm",
+                args,
+                torch.bmm(args[0], args[1]),
+                block_sizes=[[16, 16, 16], 16],
+                l2_grouping=4,
+            ),
+            """\
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+
+@triton.jit
+def _bmm_kernel(A, B, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr, _BLOCK_SIZE_3: tl.constexpr):
+    num_blocks_0 = tl.cdiv(16, _BLOCK_SIZE_0)
+    num_blocks_1 = tl.cdiv(512, _BLOCK_SIZE_1)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0 % num_blocks_1
+    pid_2 = tl.program_id(0) // (num_blocks_0 * num_blocks_1)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = offset_0 + tl.arange(0, _BLOCK_SIZE_0).to(tl.int32)
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = offset_1 + tl.arange(0, _BLOCK_SIZE_1).to(tl.int32)
+    offset_2 = pid_2 * _BLOCK_SIZE_2
+    indices_2 = offset_2 + tl.arange(0, _BLOCK_SIZE_2).to(tl.int32)
+    acc = tl.full([_BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2], 0.0, tl.float32)
+    for offset_3 in range(0, 768, _BLOCK_SIZE_3):
+        indices_3 = offset_3 + tl.arange(0, _BLOCK_SIZE_3).to(tl.int32)
+        acc_copy = acc
+        load = tl.load(A + (indices_0[:, None, None] * 393216 + indices_1[None, :, None] * 768 + indices_3[None, None, :] * 1), None)
+        load_1 = tl.load(B + (indices_0[:, None, None] * 786432 + indices_3[None, :, None] * 1024 + indices_2[None, None, :] * 1), None)
+        acc = tl.dot(load, load_1, acc=acc_copy, input_precision='tf32')
+    v_0 = acc.to(tl.float16)
+    tl.store(out + (indices_0[:, None, None] * 524288 + indices_1[None, :, None] * 1024 + indices_2[None, None, :] * 1), v_0, None)
+
+def bmm(A: torch.Tensor, B: torch.Tensor):
+    b, m, k = A.size()
+    b, k, n = B.size()
+    out = torch.empty([b, m, n], device=A.device, dtype=torch.promote_types(A.dtype, B.dtype))
+    _BLOCK_SIZE_0 = 16
+    _BLOCK_SIZE_1 = 16
+    _BLOCK_SIZE_2 = 16
+    _BLOCK_SIZE_3 = 16
+    _bmm_kernel[triton.cdiv(16, _BLOCK_SIZE_0) * triton.cdiv(512, _BLOCK_SIZE_1) * triton.cdiv(1024, _BLOCK_SIZE_2),](A, B, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, _BLOCK_SIZE_3, num_warps=4, num_stages=3)
+    return out
+
+def _bmm_make_precompiler(A: torch.Tensor, B: torch.Tensor):
+    b, m, k = A.size()
+    b, k, n = B.size()
+    out = torch.empty([b, m, n], device=A.device, dtype=torch.promote_types(A.dtype, B.dtype))
+    _BLOCK_SIZE_0 = 16
+    _BLOCK_SIZE_1 = 16
+    _BLOCK_SIZE_2 = 16
+    _BLOCK_SIZE_3 = 16
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_bmm_kernel)(A, B, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, _BLOCK_SIZE_3, num_warps=4, num_stages=3)""",
+        )
+
    def test_template_via_closure0(self):
        bias = torch.randn([1, 1024], device=DEVICE, dtype=torch.float16)
        args = (