Commit 3e1f4ff

Author: Vincent Moens

[Refactor] MaskedCategorical cross_entropy usage for faster loss

ghstack-source-id: 84330cf
Pull Request resolved: #2882

1 parent: 9c4c086

4 files changed: 100 additions & 10 deletions
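The refactor rests on a simple identity: for a categorical distribution, the log-probability of an index is the negative of the unreduced cross-entropy of the logits at that index, and torch.nn.functional.cross_entropy computes this in one fused call rather than a log-softmax followed by a gather, which is typically faster. Below is a minimal sketch of the identity, not taken from the commit (note that reduction="none" is the current spelling of the deprecated reduce=False argument that appears in the diff):

    import torch
    import torch.nn.functional as F
    from torch.distributions import Categorical

    logits = torch.randn(8, 5)       # batch of 8, 5 categories
    value = torch.randint(5, (8,))   # one category index per batch element

    # Categorical normalizes logits internally (log_softmax), so the
    # gather-based log_prob and the fused cross_entropy agree exactly.
    log_prob_gather = Categorical(logits=logits).log_prob(value)
    log_prob_ce = -F.cross_entropy(logits, value, reduction="none")
    torch.testing.assert_close(log_prob_gather, log_prob_ce)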

test/test_distributions.py

Lines changed: 40 additions & 0 deletions

@@ -488,6 +488,46 @@ def test_sample_sparse(self, neg_inf: float) -> None:
         sample_probs = torch.bincount(samples) / num_samples
         torch.testing.assert_close(sample_probs, ref_probs, rtol=1e-5, atol=1e-2)
 
+    @pytest.mark.parametrize("neg_inf", [-1e20, float("-inf")])
+    @pytest.mark.parametrize("sparse", [False, True])
+    @pytest.mark.parametrize("ndim", [2, 1, 3])
+    def test_crossentropy(self, sparse: bool, neg_inf: float, ndim: int):
+        torch.manual_seed(0)
+        logits = torch.randn(4).log_softmax(dim=-1)
+        # probs = logits.exp()
+        mask = torch.tensor([True, False, True, True])
+        indices = torch.tensor([0, 2, 3])
+
+        if ndim >= 2:
+            mask = mask.unsqueeze(0)
+            logits = logits.unsqueeze(0)
+            indices = indices.unsqueeze(0)
+        if ndim == 3:
+            mask = mask.unsqueeze(0)
+            logits = logits.unsqueeze(0)
+            indices = indices.unsqueeze(0)
+
+        dist_ce = MaskedCategorical(
+            logits=logits,
+            neg_inf=neg_inf,
+            mask=mask if not sparse else None,
+            indices=indices if sparse else None,
+            use_cross_entropy=True,
+        )
+        dist = MaskedCategorical(
+            logits=logits,
+            neg_inf=neg_inf,
+            mask=mask if not sparse else None,
+            indices=indices if sparse else None,
+            use_cross_entropy=False,
+        )
+        data = torch.tensor(0)
+        if ndim >= 2:
+            data = data.unsqueeze(0)
+        if ndim == 3:
+            data = data.unsqueeze(0)
+        torch.testing.assert_close(dist.log_prob(data), dist_ce.log_prob(data))
+
 
 class TestOneHotCategorical:
     def test_one_hot(self):
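To exercise the new test on its own, an invocation along these lines should work (assuming the repository's usual pytest setup):

    python -m pytest test/test_distributions.py -k test_crossentropy -v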

torchrl/modules/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -93,7 +93,7 @@
 )
 from .utils import get_primers_from_module
 from .planners import CEMPlanner, MPCPlannerBase, MPPIPlanner  # usort:skip
-from .llm import TransformersWrapper, vLLMWrapper
+from .llm import CategoricalSequential, TransformersWrapper, vLLMWrapper
 
 __all__ = [
     "Actor",
@@ -109,6 +109,7 @@
     "Conv3dNet",
     "ConvNet",
     "DTActor",
+    "CategoricalSequential",
     "DdpgCnnActor",
     "DdpgCnnQNet",
     "DdpgMlpActor",

torchrl/modules/distributions/discrete.py

Lines changed: 24 additions & 3 deletions

@@ -184,6 +184,8 @@ class MaskedCategorical(D.Categorical):
             invalid (out-of-mask) indices. Defaults to -inf.
         padding_value: The padding value in the mask tensor. When
             sparse_mask == True, the padding_value will be ignored.
+        use_cross_entropy (bool, optional): For faster computation of the log-probability,
+            the cross_entropy loss functional can be used. Defaults to ``False``.
 
     Examples:
         >>> torch.manual_seed(0)
@@ -225,6 +227,7 @@ def __init__(
         indices: torch.Tensor = None,
         neg_inf: float = float("-inf"),
         padding_value: int | None = None,
+        use_cross_entropy: bool = False,
     ) -> None:
         if not ((mask is None) ^ (indices is None)):
             raise ValueError(
@@ -247,6 +250,7 @@
             probs = probs / probs.sum(-1, keepdim=True)
             logits = probs.log()
         num_samples = logits.shape[-1]
+        self.use_cross_entropy = use_cross_entropy
         logits = self._mask_logits(
             logits,
             mask,
@@ -282,19 +286,36 @@ def sample(
 
     def log_prob(self, value: torch.Tensor) -> torch.Tensor:
         if not self._sparse_mask:
-            return super().log_prob(value)
+            if self.use_cross_entropy:
+                logits = self.logits
+                if logits.ndim > 2:
+                    # Bring the category dim to dim 1, as cross_entropy expects
+                    logits = logits.transpose(-1, 1)
+                result = -torch.nn.functional.cross_entropy(logits, value, reduce=False)
+            else:
+                result = super().log_prob(value)
+            result = torch.where(torch.isfinite(result), result, self.neg_inf)
+            return result
 
         idx_3d = self._mask.view(1, -1, self._num_events)
         val_3d = value.view(-1, idx_3d.size(1), 1)
         mask = idx_3d == val_3d
         idx = mask.int().argmax(dim=-1, keepdim=True)
-        ret = super().log_prob(idx.view_as(value))
+        idx = idx.view_as(value)
+        if self.use_cross_entropy:
+            logits = self.logits
+            if logits.ndim > 2:
+                # Bring the category dim to dim 1, as cross_entropy expects
+                logits = logits.transpose(-1, 1)
+            ret = -torch.nn.functional.cross_entropy(logits, idx, reduce=False)
+        else:
+            ret = super().log_prob(idx)
         # Fill masked values with neg_inf.
         ret = ret.view_as(val_3d)
         ret = ret.masked_fill(
             torch.logical_not(mask.any(dim=-1, keepdim=True)), self.neg_inf
         )
-        return ret.resize_as(value)
+        return ret.view_as(value)
 
     @staticmethod
     def _mask_logits(
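The transpose(-1, 1) in log_prob is needed because the two APIs disagree on where the category dimension lives: torch distributions keep categories in the last dimension, while cross_entropy expects them in dim 1 (input shapes (N, C) or (N, C, d1, ...)). A small illustrative sketch of that layout conversion, not taken from the commit:

    import torch
    import torch.nn.functional as F
    from torch.distributions import Categorical

    # Distribution convention: categories last -> (batch, time, categories)
    logits = torch.randn(2, 7, 5)
    value = torch.randint(5, (2, 7))

    log_prob_dist = Categorical(logits=logits).log_prob(value)
    # cross_entropy convention: categories in dim 1 -> (batch, categories, time)
    log_prob_ce = -F.cross_entropy(logits.transpose(-1, 1), value, reduction="none")
    torch.testing.assert_close(log_prob_dist, log_prob_ce)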

torchrl/modules/llm/common.py

Lines changed: 34 additions & 6 deletions

@@ -4,14 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 from __future__ import annotations
 
+import torch
 from tensordict import NestedKey, TensorDictBase
-from tensordict.nn import (
-    ProbabilisticTensorDictModule,
-    TensorDictModuleBase,
-    TensorDictSequential,
-)
+from tensordict.nn import TensorDictModuleBase, TensorDictSequential
 from torch import distributions as D
 from torch.distributions import Categorical
+from torchrl.modules import MaskedCategorical
 
 
 class CategoricalSequential(TensorDictModuleBase):
@@ -21,14 +19,44 @@ class CategoricalSequential(TensorDictModuleBase):
 
     """
 
+    generate: bool
+
     def get_dist(
         self,
         tensordict: TensorDictBase,
         tensordict_out: TensorDictBase | None = None,
+        as_padded_tensor: bool | None = None,
+        as_nested_tensor: bool | None = None,
+        padding_value: float | None = None,
+        padding_side: str = "right",
+        layout: torch.layout | None = None,
         **kwargs,
     ) -> D.Distribution:
         td_out = self(tensordict.copy())
-        return Categorical(td_out.get("logits"))
+        # By default, pad the logits and use a masked categorical distribution
+        if as_padded_tensor is None:
+            as_padded_tensor = as_nested_tensor is not True
+        if padding_value is None:
+            padding_value = 0.0
+        if as_nested_tensor is None:
+            as_nested_tensor = False
+        logits = td_out.get(
+            "logits",
+            as_padded_tensor=as_padded_tensor,
+            as_nested_tensor=as_nested_tensor,
+            padding_value=padding_value,
+            padding_side=padding_side,
+            layout=layout,
+        )
+        if as_padded_tensor:
+            # Padded entries can be masked out with MaskedCategorical
+            dist = MaskedCategorical(
+                logits=logits,
+                mask=logits != padding_value,
+                # use_cross_entropy=True,
+            )
+            return dist
+        return Categorical(logits)
 
     # Sampling is taken care of by the sub-modules
     forward = TensorDictSequential.forward
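In get_dist, ragged per-token logits are padded into a dense tensor with padding_value, and every entry still equal to that value is then declared invalid through mask=logits != padding_value (which implicitly assumes no real logit is ever exactly equal to the padding value). A toy sketch of that construction, with made-up shapes and values:

    import torch
    from torchrl.modules import MaskedCategorical

    padding_value = 0.0
    # Two rows of logits padded to the same width with padding_value.
    logits = torch.tensor(
        [
            [0.5, -1.0, 0.3, padding_value],           # one padded slot
            [1.2, 0.1, padding_value, padding_value],  # two padded slots
        ]
    )
    # Entries equal to padding_value are masked out and can never be sampled.
    dist = MaskedCategorical(logits=logits, mask=logits != padding_value)
    samples = dist.sample((100,))
    assert (samples[:, 0] < 3).all() and (samples[:, 1] < 2).all()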
