Improve Tensor.item() handling (#307)

jansel · web-flow · commit b9e76dc4eae7 · 2025-07-13T14:55:17.000-07:00
diff --git a/helion/_compiler/type_propagation.py b/helion/_compiler/type_propagation.py
@@ -581,6 +581,19 @@ def propagate_call(
                 raise exc.TypeInferenceError(
                     f"Tensor.{attr}() args must be literals"
                 ) from None
+        if attr == "item" and not (args or kwargs):
+            if origin.is_device():
+                raise exc.NotAllowedOnDevice("Tensor.item()")
+            if self.tensor.fake_value.numel() != 1:
+                raise exc.TypeInferenceError("Tensor.item() requires numel() == 1")
+            dtype = self.tensor.fake_value.dtype
+            if dtype.is_complex:
+                raise exc.TypeInferenceError("Complex tensors not supported")
+            if dtype.is_floating_point:
+                return SymFloatType.new_unbacked(origin)
+            if dtype == torch.bool:
+                return SymBoolType.new_unbacked(origin)
+            return SymIntType.new_unbacked(origin)
 
         proxy_args = [x.tree_map(_to_proxy) for x in args]
         proxy_kwargs = {k: v.tree_map(_to_proxy) for k, v in kwargs.items()}
diff --git a/test/test_misc.expected b/test/test_misc.expected
@@ -53,6 +53,37 @@ def _kernel_make_precompiler(a_list, b_dict, b_tuple, c_named_tuple, d_dataclass
     from helion.runtime.precompile_shim import make_precompiler
     return make_precompiler(_kernel_kernel)(a0, o0, o1, a0.size(0), a0.stride(0), o0.stride(0), o1.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
 
+--- assertExpectedJournal(TestMisc.test_scalar_tensor_item_method)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+
+@triton.jit
+def _kernel_with_scalar_item_kernel(x, result, x_size_0, result_stride_0, x_stride_0, scalar_val, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    load = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    v_0 = load + scalar_val
+    tl.store(result + indices_0 * result_stride_0, v_0, mask_0)
+
+def kernel_with_scalar_item(x: torch.Tensor, scalar_tensor: torch.Tensor):
+    result = torch.empty_like(x)
+    scalar_val = scalar_tensor.item()
+    _BLOCK_SIZE_0 = 128
+    _kernel_with_scalar_item_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_0),](x, result, x.size(0), result.stride(0), x.stride(0), scalar_val, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return result
+
+def _kernel_with_scalar_item_make_precompiler(x: torch.Tensor, scalar_tensor: torch.Tensor):
+    result = torch.empty_like(x)
+    scalar_val = scalar_tensor.item()
+    _BLOCK_SIZE_0 = 128
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_kernel_with_scalar_item_kernel)(x, result, x.size(0), result.stride(0), x.stride(0), scalar_val, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+
 --- assertExpectedJournal(TestMisc.test_tile_block_size_constexpr_fix)
 from __future__ import annotations
 
diff --git a/test/test_misc.py b/test/test_misc.py
@@ -287,6 +287,32 @@ def kernel_no_config(x: torch.Tensor) -> torch.Tensor:
             "no config provided and no implicit config available", str(cm.exception)
         )
 
+    def test_scalar_tensor_item_method(self):
+        """Host-side Tensor.item() reaches the kernel as a runtime scalar argument."""
+
+        @helion.kernel(use_default_config=True)
+        def kernel_with_scalar_item(
+            x: torch.Tensor, scalar_tensor: torch.Tensor
+        ) -> torch.Tensor:
+            result = torch.empty_like(x)
+            scalar_val = scalar_tensor.item()
+            for tile in hl.tile(x.shape):
+                result[tile] = x[tile] + scalar_val
+            return result
+
+        x = torch.randn(100, device=DEVICE)
+        code, result = code_and_output(
+            kernel_with_scalar_item, (x, torch.tensor(5.0, device=DEVICE))
+        )
+        self.assertExpectedJournal(code)
+        torch.testing.assert_close(result, x + 5)
+
+        code2, result2 = code_and_output(
+            kernel_with_scalar_item, (x, torch.tensor(10.0, device=DEVICE))
+        )
+        self.assertEqual(code, code2)  # generated code must not depend on the value
+        torch.testing.assert_close(result2, x + 10)
+
 
 if __name__ == "__main__":
     unittest.main()