Skip to content

Commit 03b58ca

Browse files
authored
Fix bug with tensor descriptor and small block size (#296)
1 parent 18a07a3 commit 03b58ca

File tree

3 files changed

+85
-25
lines changed

3 files changed

+85
-25
lines changed

helion/_compiler/compile_environment.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,7 @@ def symbol(self) -> sympy.Symbol:
441441
return self.var._sympy_()
442442

443443
def from_config(self, config: Config) -> int | torch.SymInt | None:
444-
return self.block_size_source.from_config(config, self.block_id)
444+
return self.block_size_source.from_config(config, self)
445445

446446
def from_config_assert(self, config: Config) -> int | torch.SymInt:
447447
val = self.from_config(config)
@@ -461,7 +461,9 @@ def update_min_block(self, value: int, *, allow_flattened: bool = True) -> None:
461461

462462

463463
class BlockSizeSource:
464-
def from_config(self, config: Config, block_id: int) -> int | torch.SymInt | None:
464+
def from_config(
465+
self, config: Config, block_size_info: BlockSizeInfo
466+
) -> int | torch.SymInt | None:
465467
raise NotImplementedError
466468

467469
def l2_grouping(self, config: Config) -> int:
@@ -472,15 +474,17 @@ def l2_grouping(self, config: Config) -> int:
472474
class FixedBlockSizeSource(BlockSizeSource):
473475
value: int | torch.SymInt
474476

475-
def from_config(self, config: Config, block_id: int) -> int | torch.SymInt:
477+
def from_config(
478+
self, config: Config, block_size_info: BlockSizeInfo
479+
) -> int | torch.SymInt:
476480
return self.value
477481

478482

479483
@dataclasses.dataclass
480484
class LoopSpecBlockSizeSource(BlockSizeSource):
481-
def from_config(self, config: Config, block_id: int) -> int:
485+
def from_config(self, config: Config, block_size_info: BlockSizeInfo) -> int:
482486
index = CompileEnvironment.current().config_spec.block_sizes.block_id_to_index(
483-
block_id
487+
block_size_info.block_id
484488
)
485489
return config.block_sizes[index]
486490

@@ -489,7 +493,12 @@ def from_config(self, config: Config, block_id: int) -> int:
489493
class ReductionLoopBlockSizeSource(BlockSizeSource):
490494
reduction_loop: int
491495

492-
def from_config(self, config: Config, block_id: int) -> int | None:
496+
def from_config(self, config: Config, block_size_info: BlockSizeInfo) -> int | None:
497+
if (
498+
len(config.reduction_loops) <= self.reduction_loop
499+
or config.reduction_loops[self.reduction_loop] is None
500+
):
501+
return next_power_of_2(block_size_info.size_hint())
493502
return config.reduction_loops[self.reduction_loop]
494503

495504

helion/_compiler/indexing_strategy.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from .. import exc
1313
from .ast_extension import expr_from_string
1414
from .compile_environment import CompileEnvironment
15+
from .device_function import DeviceFunction
1516
from .host_function import HostFunction
1617
from .tile_strategy import DeviceLoopState
1718
from .variable_origin import BlockSizeOrigin
@@ -178,9 +179,40 @@ def is_supported(
178179
byte_stride = stride * element_size
179180
if byte_stride % 16 != 0:
180181
return False
182+
if stride_one_count != 1:
183+
# There should be exactly one dimension with stride==1
184+
return False
185+
186+
def valid_block_size(
187+
block_size: int | torch.SymInt | None, stride: int | torch.SymInt
188+
) -> bool:
189+
if not isinstance(block_size, int):
190+
return False
191+
# was getting some illegal memory accesses (IMAs) with small block sizes even in non-stride-1 dims
192+
return block_size * element_size >= 16 or (block_size == 1 and stride != 1)
193+
194+
# 4) Check minimum 16 bytes in each dimension
195+
size_stride = collections.deque(
196+
zip(fake_tensor.size(), fake_tensor.stride(), strict=True)
197+
)
198+
config = DeviceFunction.current().config
199+
for k in subscript:
200+
if k is None:
201+
continue
202+
size, stride = size_stride.popleft()
203+
if str(k) == "slice(None, None, None)":
204+
block_size = env.allocate_reduction_dimension(size).from_config(config)
205+
if not valid_block_size(block_size, stride):
206+
return False
207+
elif isinstance(k, torch.SymInt):
208+
block_id = env.get_block_id(k)
209+
if block_id is None:
210+
return False
211+
block_size = env.block_sizes[block_id].from_config(config)
212+
if not valid_block_size(block_size, stride):
213+
return False
181214

182-
# TODO(jansel): check that base_ptr is aligned to 16 bytes
183-
return stride_one_count == 1
215+
return True
184216

185217
def codegen_load(
186218
self,

test/test_tensor_descriptor.py

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -109,27 +109,11 @@ def kernel_3d_permutation(x: torch.Tensor) -> torch.Tensor:
109109
base_tensor = torch.randn(storage_size, device=DEVICE, dtype=torch.float32)
110110
x = base_tensor.as_strided([4, 8, 4], [64, 1, 4])
111111

112-
# Verify stride pattern - middle dimension should have stride 1, others 16-byte aligned
113-
self.assertEqual(x.stride(), (64, 1, 4)) # Expected stride pattern
114-
self.assertEqual(x.stride()[1], 1) # middle dimension has stride 1
115-
116-
# Check 16-byte alignment for non-stride-1 dimensions
117-
element_size = x.element_size()
118-
for dim in range(x.ndim):
119-
stride = x.stride(dim)
120-
if stride != 1:
121-
byte_stride = stride * element_size
122-
self.assertEqual(
123-
byte_stride % 16,
124-
0,
125-
f"Dim {dim} not 16-byte aligned: stride={stride}, byte_stride={byte_stride}",
126-
)
127-
128112
code, result = code_and_output(
129113
kernel_3d_permutation,
130114
(x,),
131115
indexing="tensor_descriptor",
132-
block_sizes=[2, 4, 2],
116+
block_sizes=[8, 8, 8],
133117
)
134118

135119
# Check correctness
@@ -288,6 +272,41 @@ def test_attention_td_dynamic(self):
288272
)
289273
)
290274

275+
@unittest.skipUnless(
276+
supports_tensor_descriptor(), "Tensor descriptor support is required"
277+
)
278+
def test_minimum_16_byte_block_size_fallback(self):
279+
"""Test that tensor descriptor falls back when block size is too small."""
280+
281+
@helion.kernel(use_default_config=True)
282+
def kernel_small_block(x: torch.Tensor) -> torch.Tensor:
283+
result = torch.zeros_like(x)
284+
for tile in hl.tile(x.size()):
285+
result[tile] = x[tile] + 1.0
286+
return result
287+
288+
# Create a tensor with proper stride alignment
289+
x = torch.randn([8, 16], device=DEVICE, dtype=torch.float32)
290+
291+
# Use small block sizes that would result in < 16 bytes in last dimension
292+
# block_sizes=[4, 2] means last dimension block size = 2
293+
# 2 * 4 bytes (float32) = 8 bytes < 16 bytes required
294+
# With the fix, this should fall back to another indexing strategy
295+
code, result = code_and_output(
296+
kernel_small_block,
297+
(x,),
298+
indexing="tensor_descriptor", # Request tensor descriptor
299+
block_sizes=[4, 2], # Small block size in last dimension
300+
)
301+
302+
# Should fall back to block_ptr or pointer indexing instead of tensor descriptor
303+
# If our fix works, this should NOT contain tensor descriptor
304+
self.assertNotIn("tl.make_tensor_descriptor", code)
305+
306+
# But should still work correctly
307+
expected = x + 1.0
308+
torch.testing.assert_close(result, expected)
309+
291310

292311
if __name__ == "__main__":
293312
unittest.main()

0 commit comments

Comments (0)