Add cross_entropy example and unit test

yf225 · yf225 · commit a1e7a92de391 · 2025-07-14T13:05:50.000-07:00
diff --git a/examples/cross_entropy.py b/examples/cross_entropy.py
@@ -0,0 +1,80 @@
+"""Cross entropy with index computation that works within Helion's constraints."""
+
+from __future__ import annotations
+
+import torch
+
+import helion
+from helion._testing import run_example
+import helion.language as hl
+from helion.utils import get_gpu_memory_info
+
+# TritonBench configuration - only set here when GPU memory is limited
+if get_gpu_memory_info()[0] < 16.0:
+    # Low memory configuration: first memory-info entry under 16 (presumably GB)
+    TRITONBENCH_ARGS = {"B": 4, "T": 512, "v_range": "10,15"}  # undefined otherwise
+
+
+@helion.kernel(ignore_warnings=[helion.exc.TensorOperationInWrapper])
+def cross_entropy(
+    logits: torch.Tensor,  # [N, V] input logits
+    labels: torch.Tensor,  # [N] target class indices, assumed in [0, V) (unchecked)
+) -> torch.Tensor:  # mean of the N per-row losses
+    n, v = logits.shape
+    losses = torch.zeros([n], dtype=logits.dtype, device=logits.device)
+
+    # Flat offset of the first element of each row: [0, V, 2V, 3V, ...]
+    base_indices = torch.arange(n, device=logits.device) * v
+
+    # 1-D view of logits: element (i, j) sits at flat index i*V + j
+    logits_flat = logits.view(-1)
+
+    for tile_n in hl.tile(n):
+        # Slice this tile's labels and row offsets
+        labels_tile = labels[tile_n]  # [tile_size]
+        base_indices_tile = base_indices[tile_n]  # [tile_size]
+
+        # Flat index of each row's target logit:
+        # flat_indices[i] = base_indices[i] + labels[i] = i*V + labels[i]
+        flat_indices = base_indices_tile + labels_tile
+
+        # Gather one logit per row (the target class) from the flattened view
+        logits_at_target = hl.load(logits_flat, [flat_indices])
+
+        # Next: log(sum(exp(row))) for every row in the tile
+        # Load the full rows for this tile
+        logits_rows = logits[tile_n, :]  # [tile_size, V]
+
+        # Log-sum-exp: subtract the row max before exp for stability, add it back after
+        max_logits = torch.amax(logits_rows, dim=-1, keepdim=True)
+        shifted = logits_rows - max_logits
+        exp_shifted = torch.exp(shifted)
+        sum_exp = torch.sum(exp_shifted, dim=-1, keepdim=True)
+        log_sum_exp = max_logits.squeeze(-1) + torch.log(sum_exp.squeeze(-1))
+
+        # Per-row loss = -log_softmax(row)[label] = log_sum_exp - logit_at_target
+        losses[tile_n] = log_sum_exp - logits_at_target
+
+    return losses.mean()
+
+
+def main() -> None:
+    """Run the cross entropy kernel against the PyTorch baseline on one input size."""
+    num_rows, vocab_size = 128, 1000
+    inputs = (
+        torch.randn(num_rows, vocab_size, device="cuda", dtype=torch.float32),
+        torch.randint(0, vocab_size, (num_rows,), device="cuda", dtype=torch.long),
+    )
+    run_example(
+        cross_entropy,
+        torch.nn.functional.cross_entropy,
+        inputs,
+        kernel_name="helion",
+        baseline_name="torch",
+        rtol=1e-4,
+        atol=1e-4,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/test_examples.expected b/test/test_examples.expected
@@ -459,6 +459,57 @@ def _concat2d_dim1_make_precompiler(x: torch.Tensor, y: torch.Tensor):
     from helion.runtime.precompile_shim import make_precompiler
     return make_precompiler(_concat2d_dim1_kernel)(x, out, y, out.size(0), out.size(1), x.size(0), x.size(1), out.stride(0), out.stride(1), x.stride(0), x.stride(1), y.stride(0), y.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
 
+--- assertExpectedJournal(TestExamples.test_cross_entropy)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_helpers import math as tl_math
+
+@triton.jit
+def _cross_entropy_kernel(labels, base_indices, logits_flat, logits, losses, base_indices_stride_0, labels_stride_0, logits_stride_0, logits_stride_1, logits_flat_stride_0, losses_stride_0, v, _RDIM_SIZE_1: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0
+    indices_0 = offset_0 + tl.zeros([1], tl.int32)
+    indices_1 = tl.arange(0, _RDIM_SIZE_1).to(tl.int32)
+    mask_1 = indices_1 < v
+    labels_tile = tl.load(labels + indices_0 * labels_stride_0, None)
+    base_indices_tile = tl.load(base_indices + indices_0 * base_indices_stride_0, None)
+    v_0 = base_indices_tile + labels_tile
+    logits_at_target = tl.load(logits_flat + v_0 * logits_flat_stride_0, None)
+    logits_rows = tl.load(logits + (indices_0[:, None] * logits_stride_0 + indices_1[None, :] * logits_stride_1), mask_1[None, :], other=0)
+    _mask_to = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), logits_rows, float('-inf'))
+    max_logits = tl.reshape(tl.max(_mask_to, 1), [1, 1])
+    v_1 = logits_rows - max_logits
+    v_2 = tl_math.exp(v_1)
+    _mask_to_1 = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), v_2, 0)
+    sum_exp = tl.reshape(tl.sum(_mask_to_1, 1), [1, 1])
+    squeeze = tl.reshape(max_logits, [1])
+    squeeze_1 = tl.reshape(sum_exp, [1])
+    v_3 = tl_math.log(squeeze_1)
+    v_4 = squeeze + v_3
+    v_5 = v_4 - logits_at_target
+    tl.store(losses + indices_0 * losses_stride_0, v_5, None)
+
+def cross_entropy(logits: torch.Tensor, labels: torch.Tensor):
+    n, v = logits.shape
+    losses = torch.zeros([n], dtype=logits.dtype, device=logits.device)
+    base_indices = torch.arange(n, device=logits.device) * v
+    logits_flat = logits.view(-1)
+    _RDIM_SIZE_1 = triton.next_power_of_2(v)
+    _cross_entropy_kernel[n,](labels, base_indices, logits_flat, logits, losses, base_indices.stride(0), labels.stride(0), logits.stride(0), logits.stride(1), logits_flat.stride(0), losses.stride(0), v, _RDIM_SIZE_1, num_warps=4, num_stages=3)
+    return losses.mean()
+
+def _cross_entropy_make_precompiler(logits: torch.Tensor, labels: torch.Tensor):
+    n, v = logits.shape
+    losses = torch.zeros([n], dtype=logits.dtype, device=logits.device)
+    base_indices = torch.arange(n, device=logits.device) * v
+    logits_flat = logits.view(-1)
+    _RDIM_SIZE_1 = triton.next_power_of_2(v)
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_cross_entropy_kernel)(labels, base_indices, logits_flat, logits, losses, base_indices.stride(0), labels.stride(0), logits.stride(0), logits.stride(1), logits_flat.stride(0), losses.stride(0), v, _RDIM_SIZE_1, num_warps=4, num_stages=3)
+
 --- assertExpectedJournal(TestExamples.test_embedding_block_ptr)
 from __future__ import annotations
 
diff --git a/test/test_examples.py b/test/test_examples.py
@@ -267,6 +267,20 @@ def test_softmax_two_pass_block_ptr(self):
             )
         )
 
+    def test_cross_entropy(self):
+        # 128 rows over 1000 classes, float32 logits with int64 labels
+        n_rows, n_classes = 128, 1000
+        args = (
+            torch.randn(n_rows, n_classes, device=DEVICE, dtype=torch.float32),
+            torch.randint(0, n_classes, (n_rows,), device=DEVICE, dtype=torch.long),
+        )
+
+        # PyTorch result handed to check_example as the reference value
+        expected = torch.nn.functional.cross_entropy(*args)
+        self.assertExpectedJournal(
+            check_example("cross_entropy", args, expected)
+        )
+
     def test_rms_norm(self):
         args = (
             torch.randn([128, 256], device=DEVICE, dtype=torch.float16),