Commit 566045a

Use tensor device reference in persistent kernels (#317)
1 parent: 00d13b6

File tree: 3 files changed (+47, -37 lines)

helion/_compiler/program_id.py

Lines changed: 12 additions & 2 deletions
@@ -420,13 +420,23 @@ def __init__(self, is_blocked: bool = False) -> None:
             "step": NUM_SM_VAR,
         }
         if device_function.constexpr_arg(NUM_SM_VAR):
-            device = CompileEnvironment.current().device
             device_function.codegen.host_statements.append(
                 statement_from_string(
-                    f"{NUM_SM_VAR} = helion.runtime.get_num_sm(torch.{device!r})"
+                    f"{NUM_SM_VAR} = helion.runtime.get_num_sm({self.get_device_str()})"
                 )
             )

+    def get_device_str(self) -> str:
+        """Get the device string for the current device, reusing the first tensor's origin."""
+        host_function = HostFunction.current()
+        device = CompileEnvironment.current().device
+        origins = [
+            o for t, o in host_function.tensor_to_origin.items() if t.device == device
+        ]
+        if origins:
+            return f"{origins[0].host_str()}.device"
+        return f"torch.{device!r}"
+
     def codegen_grid(self) -> ast.AST:
         # Use num_sms for persistent kernels
         return expr_from_string(f"({NUM_SM_VAR},)")
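Note: the fallback logic in the new helper is self-contained enough to sketch outside the compiler. Below is a minimal reconstruction for illustration; the Origin class is a hypothetical stand-in for Helion's real origin objects (only its host_str() method, which renders an origin as host-side source such as "q_in", is assumed). The selection logic mirrors get_device_str above: prefer emitting a reference to an existing tensor argument's .device, and fall back to a hardcoded device repr only when no tracked tensor lives on the compile device.

import torch

class Origin:
    # Hypothetical stand-in for Helion's origin objects; only host_str() is assumed.
    def __init__(self, name: str) -> None:
        self.name = name

    def host_str(self) -> str:
        return self.name

def get_device_str(tensor_to_origin: dict, device: torch.device) -> str:
    # Prefer referencing an existing argument's .device so the generated
    # host code follows whatever device the caller actually passes in.
    origins = [o for t, o in tensor_to_origin.items() if t.device == device]
    if origins:
        return f"{origins[0].host_str()}.device"
    # Fall back to a hardcoded device repr when no tensor matches.
    return f"torch.{device!r}"

# Demonstrates both branches, even on a CPU-only build:
t = torch.empty(1)
print(get_device_str({t: Origin("q_in")}, t.device))  # -> q_in.device
print(get_device_str({}, torch.device("cpu")))        # -> torch.device(type='cpu')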

test/test_examples.expected

Lines changed: 1 addition & 1 deletion
@@ -259,7 +259,7 @@ def attention(q_in: torch.Tensor, k_in: torch.Tensor, v_in: torch.Tensor, *, _la
     v_view = v_in.reshape([-1, n_dim, head_dim])
     k_view = k_in.reshape([-1, n_dim, head_dim]).transpose(1, 2)
     out = torch.empty_like(q_view)
-    _NUM_SM = helion.runtime.get_num_sm(torch.device(type='cuda', index=0))
+    _NUM_SM = helion.runtime.get_num_sm(q_in.device)
     _BLOCK_SIZE_1 = 64
     _BLOCK_SIZE_3 = 64
     _launcher(_attention_kernel, (_NUM_SM,), q_view, k_view, v_view, out, _NUM_SM, _BLOCK_SIZE_1, _BLOCK_SIZE_3, num_warps=4, num_stages=3)
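In the regenerated expected output, the host code now derives the SM count from the kernel's own input (q_in.device) rather than a repr that froze the device at compile time, so the same compiled kernel launches correctly when its inputs live on, say, cuda:1. For context, a helper with get_num_sm's assumed behavior can be written with PyTorch's device-properties API; this is a sketch of that assumption, not necessarily Helion's actual implementation:

import torch

def get_num_sm(device: torch.device) -> int:
    # Assumed behavior: report the number of streaming multiprocessors
    # (SMs), which persistent kernels use as their launch grid size.
    assert device.type == "cuda", "SM count is only defined for CUDA devices"
    return torch.cuda.get_device_properties(device).multi_processor_count

# Usage mirroring the generated host code above:
#   _NUM_SM = get_num_sm(q_in.device)
#   _launcher(_attention_kernel, (_NUM_SM,), ...)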
