`_cutlass_implicit_gemm_forward_logic` with ConvBlock

Hi, @chrischoy 

I'm testing the MinkUNet model and encountered an issue:
```sh
CuPy allocator set to PyTorch memory pool.
[INFO] benchmark_cache.py:203 - Loaded benchmark cache v2.0: 3 total configurations
[INFO] sparse_conv.py:307 - Loaded 3 forward and 0 backward benchmark configurations from cache
coordinates shape: torch.Size([131072, 3])
features shape: torch.Size([131072, 3])
offsets shape: torch.Size([3])
points shape: torch.Size([2, 4, 128, 128, 3])
images shape: torch.Size([2, 4, 128, 128, 3])

Traceback (most recent call last):
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/chenyu/Projects/DOGE/conerf/model/backbone/warp_conv_minkunet.py", line 390, in <module>
    outputs = model(sparse_tensor)
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/chenyu/Projects/DOGE/conerf/model/backbone/warp_conv_minkunet.py", line 168, in forward
    out = self.conv1(out_p1)
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/site-packages/warpconvnet-0.3.5-py3.10-linux-x86_64.egg/warpconvnet/nn/modules/sequential.py", line 56, in forward
    x, in_sf = run_forward(module, x, in_sf)
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/site-packages/warpconvnet-0.3.5-py3.10-linux-x86_64.egg/warpconvnet/nn/modules/sequential.py", line 15, in run_forward
    return module(x), in_sf
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/site-packages/warpconvnet-0.3.5-py3.10-linux-x86_64.egg/warpconvnet/nn/modules/sparse_conv.py", line 200, in forward
    return spatially_sparse_conv(
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/site-packages/warpconvnet-0.3.5-py3.10-linux-x86_64.egg/warpconvnet/nn/functional/sparse_conv.py", line 1981, in spatially_sparse_conv
    out_feature_tensor = UnifiedSpatiallySparseConvFunction.apply(
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/site-packages/torch/autograd/function.py", line 575, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/home/chenyu/anaconda3/envs/dogs/lib/python3.10/site-packages/warpconvnet-0.3.5-py3.10-linux-x86_64.egg/warpconvnet/nn/functional/sparse_conv.py", line 1663, in forward
    raise RuntimeError(
RuntimeError: Error in _cutlass_implicit_gemm_forward_logic: Kernel execution failed
```

My code for testing:
```python
import torch

from warpconvnet.geometry.types.points import Points
from warpconvnet.geometry.types.voxels import Voxels
from warpconvnet.nn.functional.point_pool import point_pool


def colored_pc_to_warp_conv_points(
    point_coordinates: torch.Tensor,
    point_features: torch.Tensor,
):
    """
    Flatten a batched, colored point grid into a warpconvnet ``Points`` object.

    A point is kept only when the sum of the absolute values of its XYZ
    coordinates exceeds 1e-6. Batch items without any valid point are skipped
    entirely, so they add no entry to the offsets.

    Args:
        point_coordinates: (B, F, H, W, 3) tensor - XYZ
        point_features: (B, F, H, W, 3) tensor - RGB

    Returns:
        points: warpconvnet points built from the concatenated coordinates,
            the concatenated features and the cumulative per-batch offsets
            (starting at 0).
    """
    device = point_coordinates.device
    B, F, H, W, _ = point_coordinates.shape

    # 1. Collapse the (F, H, W) grid into one point axis per batch item.
    point_cloud = point_coordinates.reshape(B, F * H * W, 3)
    point_feats = point_features.reshape(B, F * H * W, 3)
    # A point is valid when its XYZ is not (numerically) all zero.
    valid_mask = point_cloud.abs().sum(-1) > 1e-6

    # 2. Gather the valid points of every batch item and track where each
    #    item ends in the concatenated tensors.
    all_coordinates, all_features, offsets = [], [], [0]
    for b in range(B):
        batch_mask = valid_mask[b]
        if not batch_mask.any():
            continue

        batch_points = point_cloud[b][batch_mask]
        all_coordinates.append(batch_points)
        all_features.append(point_feats[b][batch_mask])
        offsets.append(offsets[-1] + batch_points.shape[0])

    # 3. Combine all batches. The coordinates already live on `device`; the
    #    features are moved there in case they were supplied from elsewhere.
    coordinates = torch.cat(all_coordinates)
    features = torch.cat(all_features).to(device)
    # Offsets are 1-D by construction, so no squeeze is required.
    offsets = torch.tensor(offsets, device=device)
    print(f'coordinates shape: {coordinates.shape}')
    print(f'features shape: {features.shape}')
    print(f'offsets shape: {offsets.shape}')

    return Points(coordinates, features, offsets)


if __name__ == "__main__":
    # Seed every torch RNG so the reproduction uses the same random inputs
    # on each run.
    seed = 100
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # NOTE(review): MinkUNet18 is not imported in this snippet — presumably it
    # is defined earlier in warp_conv_minkunet.py (see the traceback); confirm.
    model = MinkUNet18(in_channels=3, out_channels=64).to("cuda")

    # Inputs: random dense XYZ grid and a matching random RGB grid.
    # Both are sampled on the CPU first and then moved to the GPU.
    points = torch.randn(2, 4, 128, 128, 3).to("cuda")  # [B, F, H, W, 3]
    images = torch.randn(2, 4, 128, 128, 3).to("cuda")  # [B, F, H, W, 3]

    # Flatten the grids into a warpconvnet Points object (prints the shapes
    # of the concatenated coordinates, features and offsets).
    point_cloud = colored_pc_to_warp_conv_points(points, images)

    print(f'points shape: {points.shape}')
    print(f'images shape: {images.shape}\n')

    # Pool the points with a 0.02 voxel size and mean reduction, requesting a
    # voxel return type plus the to-unique mapping.
    # NOTE(review): exact semantics of point_pool's arguments are defined by
    # warpconvnet — verify against its documentation.
    sparse_tensor, to_unique = point_pool(
        point_cloud,
        reduction="mean",
        downsample_voxel_size=0.02,
        return_type="voxel",
        return_to_unique=True,
    )

    # Forward pass; per the traceback above, this is the call that raises the
    # "_cutlass_implicit_gemm_forward_logic: Kernel execution failed" error.
    outputs = model(sparse_tensor)
```

My testing environment:
```txt
pytorch: '2.6.0+cu124'
nvcc: Build cuda_12.3.r12.3/compiler.33567101_0
```


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

`_cutlass_implicit_gemm_forward_logic` with ConvBlock #10

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

_cutlass_implicit_gemm_forward_logic with ConvBlock #10

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions

`_cutlass_implicit_gemm_forward_logic` with ConvBlock #10