
Commit c8a3eeb

Authored by Louis Faury (louisfaury)
[Feature] ClipPPOLoss can handle composite value networks (#3031)
Co-authored-by: Louis Faury <louis.faury@helsing.ai>
1 parent 773c366 commit c8a3eeb

File tree

3 files changed: +78 -48 lines changed

    test/test_cost.py
    torchrl/objectives/ppo.py
    torchrl/objectives/utils.py
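At a glance, the feature lets a critic expose several value heads under nested keys, so that state_value is a sub-tensordict rather than a single tensor, and ClipPPOLoss.loss_critic handles it leaf-wise. Below is a minimal sketch of such a composite critic, reusing the module and key names from the new test; they are illustrative, not a prescribed API.

    import torch
    from torch import nn
    from tensordict import TensorDict
    from tensordict.nn import TensorDictModule

    # Toy critic with two value heads. The nested out_keys make it write a
    # sub-tensordict under "state_value" rather than a single tensor.
    class CompositeValueNetwork(nn.Module):
        def forward(self, state):
            return torch.zeros_like(state), torch.zeros_like(state)

    critic = TensorDictModule(
        CompositeValueNetwork(),
        in_keys=["state"],
        out_keys=[("state_value", "value_0"), ("state_value", "value_1")],
    )

    td = critic(TensorDict({"state": torch.tensor([0.0])}, batch_size=(1,)))
    print(td["state_value", "value_0"])  # tensor([0.])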

test/test_cost.py

Lines changed: 41 additions & 0 deletions
@@ -9865,6 +9865,47 @@ def test_weighted_entropy_mapping_missing_key(self):
         with pytest.raises(KeyError):
             loss._weighted_loss_entropy(entropy)
 
+    def test_critic_loss_tensordict(self):
+        # Creates a dummy actor.
+        actor, _ = self._create_mock_actor_value()
+
+        # Creates a critic that produces a tensordict of values.
+        class CompositeValueNetwork(nn.Module):
+            def forward(self, _) -> tuple[torch.Tensor, torch.Tensor]:
+                return torch.tensor([0.0]), torch.tensor([0.0])
+
+        critic = TensorDictModule(
+            CompositeValueNetwork(),
+            in_keys=["state"],
+            out_keys=[("state_value", "value_0"), ("state_value", "value_1")],
+        )
+
+        # Creates the loss and its input tensordict.
+        loss = ClipPPOLoss(actor, critic, loss_critic_type="l2", clip_value=0.1)
+        td = TensorDict(
+            {
+                "state": torch.tensor([0.0]),
+                "value_target": TensorDict(
+                    {"value_0": torch.tensor([-1.0]), "value_1": torch.tensor([2.0])}
+                ),
+                # Log an existing 'state_value' for the 'clip_fraction'
+                "state_value": TensorDict(
+                    {"value_0": torch.tensor([0.0]), "value_1": torch.tensor([0.0])}
+                ),
+            },
+            batch_size=(1,),
+        )
+
+        critic_loss, clip_fraction, explained_variance = loss.loss_critic(td)
+
+        assert isinstance(critic_loss, TensorDict)
+        assert "value_0" in critic_loss.keys() and "value_1" in critic_loss.keys()
+        torch.testing.assert_close(critic_loss["value_0"], torch.tensor([1.0]))
+        torch.testing.assert_close(critic_loss["value_1"], torch.tensor([4.0]))
+
+        assert isinstance(clip_fraction, TensorDict)
+        assert isinstance(explained_variance, TensorDict)
+
 
 class TestA2C(LossModuleTestBase):
     seed = 0
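The expected values in the assertions follow from the "l2" distance: both heads predict 0.0 while the targets are -1.0 and 2.0, so the unreduced squared errors are 1.0 and 4.0 per head. A quick sanity check of that arithmetic:

    import torch

    pred = torch.tensor([0.0])
    # Per-head squared errors match the assertions in the test above.
    torch.testing.assert_close((pred - torch.tensor([-1.0])) ** 2, torch.tensor([1.0]))
    torch.testing.assert_close((pred - torch.tensor([2.0])) ** 2, torch.tensor([4.0]))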

torchrl/objectives/ppo.py

Lines changed: 11 additions & 12 deletions
@@ -690,7 +690,9 @@ def _log_weight(
 
         return log_weight, dist, kl_approx
 
-    def loss_critic(self, tensordict: TensorDictBase) -> torch.Tensor:
+    def loss_critic(
+        self, tensordict: TensorDictBase
+    ) -> tuple[torch.Tensor | TensorDict, ...]:
         """Returns the critic loss multiplied by ``critic_coef``, if it is not ``None``."""
         # TODO: if the advantage is gathered by forward, this introduces an
         # overhead that we could easily reduce.
@@ -709,28 +711,24 @@ def loss_critic(self, tensordict: TensorDictBase) -> torch.Tensor:
         )
 
         if self.clip_value:
-            old_state_value = tensordict.get(
-                self.tensor_keys.value, None
-            )  # TODO: None soon to be removed
+            old_state_value = tensordict.get(self.tensor_keys.value)
             if old_state_value is None:
                 raise KeyError(
                     f"clip_value is set to {self.clip_value}, but "
                     f"the key {self.tensor_keys.value} was not found in the input tensordict. "
-                    f"Make sure that the value_key passed to PPO exists in the input tensordict."
+                    f"Make sure that the 'value_key' passed to PPO exists in the input tensordict."
                 )
 
         with self.critic_network_params.to_module(
             self.critic_network
         ) if self.functional else contextlib.nullcontext():
             state_value_td = self.critic_network(tensordict)
 
-        state_value = state_value_td.get(
-            self.tensor_keys.value, None
-        )  # TODO: None soon to be removed
+        state_value = state_value_td.get(self.tensor_keys.value)
         if state_value is None:
             raise KeyError(
                 f"the key {self.tensor_keys.value} was not found in the critic output tensordict. "
-                f"Make sure that the value_key passed to PPO is accurate."
+                f"Make sure that the 'value_key' passed to PPO is accurate."
             )
 
         loss_value = distance_loss(
@@ -756,8 +754,9 @@ def loss_critic(self, tensordict: TensorDictBase) -> torch.Tensor:
             tgt = target_return.detach()
             pred = state_value.detach()
             eps = torch.finfo(tgt.dtype).eps
-            resid = torch.var(tgt - pred, unbiased=False, dim=0)
-            total = torch.var(tgt, unbiased=False, dim=0)
+
+            resid = torch.var(tgt - pred, correction=0, dim=0)
+            total = torch.var(tgt, correction=0, dim=0)
             explained_variance = 1.0 - resid / (total + eps)
 
         self._clear_weakrefs(
@@ -954,7 +953,7 @@ class ClipPPOLoss(PPOLoss):
             ``samples_mc_entropy`` will control how many
             samples will be used to compute this estimate.
             Defaults to ``1``.
-        entropy_coeff: scalar | Mapping[str, scalar], optional): entropy multiplier when computing the total loss.
+        entropy_coeff: (scalar | Mapping[str, scalar], optional): entropy multiplier when computing the total loss.
             * **Scalar**: one value applied to the summed entropy of every action head.
             * **Mapping** ``{head_name: coef}`` gives an individual coefficient for each action-head's entropy.
             Defaults to ``0.01``.
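Two things to note in this file: loss_critic now advertises a tuple return (the new test unpacks the loss, the clip fraction, and the explained variance), and the variance calls move from the legacy unbiased=False spelling to correction=0, which computes the same population variance. A standalone sketch of the explained-variance step, with made-up numbers:

    import torch

    tgt = torch.tensor([-1.0, 2.0, 0.5])    # hypothetical detached value targets
    pred = torch.tensor([0.0, 1.5, 0.25])   # hypothetical detached predictions
    eps = torch.finfo(tgt.dtype).eps
    resid = torch.var(tgt - pred, correction=0, dim=0)
    total = torch.var(tgt, correction=0, dim=0)
    # 1 - Var(residual) / Var(target): 1.0 means the critic explains all variance.
    explained_variance = 1.0 - resid / (total + eps)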

torchrl/objectives/utils.py

Lines changed: 26 additions & 36 deletions
@@ -9,7 +9,7 @@
 import warnings
 from copy import copy
 from enum import Enum
-from typing import Any, Callable, Iterable
+from typing import Any, Callable, Iterable, TypeVar
 
 import torch
 from tensordict import NestedKey, TensorDict, TensorDictBase, unravel_key
@@ -101,54 +101,44 @@ def decorate_context(*args, **kwargs):
     return decorate_context
 
 
+TensorLike = TypeVar("TensorLike", Tensor, TensorDict)
+
+
 def distance_loss(
-    v1: torch.Tensor,
-    v2: torch.Tensor,
+    v1: TensorLike,
+    v2: TensorLike,
     loss_function: str,
     strict_shape: bool = True,
-) -> torch.Tensor:
+) -> TensorLike:
     """Computes a distance loss between two tensors.
 
     Args:
-        v1 (Tensor): a tensor with a shape compatible with v2
-        v2 (Tensor): a tensor with a shape compatible with v1
+        v1 (Tensor | TensorDict): a tensor or tensordict with a shape compatible with v2.
+        v2 (Tensor | TensorDict): a tensor or tensordict with a shape compatible with v1.
         loss_function (str): One of "l2", "l1" or "smooth_l1" representing which loss function is to be used.
         strict_shape (bool): if False, v1 and v2 are allowed to have a different shape.
             Default is ``True``.
 
     Returns:
-        A tensor of the shape v1.view_as(v2) or v2.view_as(v1) with values equal to the distance loss between the
-        two.
+        A tensor or tensordict of the shape v1.view_as(v2) or v2.view_as(v1)
+        with values equal to the distance loss between the two.
 
     """
     if v1.shape != v2.shape and strict_shape:
         raise RuntimeError(
-            f"The input tensors have shapes {v1.shape} and {v2.shape} which are incompatible."
+            f"The input tensors or tensordicts have shapes {v1.shape} and {v2.shape} which are incompatible."
        )
 
     if loss_function == "l2":
-        value_loss = F.mse_loss(
-            v1,
-            v2,
-            reduction="none",
-        )
+        return F.mse_loss(v1, v2, reduction="none")
 
-    elif loss_function == "l1":
-        value_loss = F.l1_loss(
-            v1,
-            v2,
-            reduction="none",
-        )
+    if loss_function == "l1":
+        return F.l1_loss(v1, v2, reduction="none")
 
-    elif loss_function == "smooth_l1":
-        value_loss = F.smooth_l1_loss(
-            v1,
-            v2,
-            reduction="none",
-        )
-    else:
-        raise NotImplementedError(f"Unknown loss {loss_function}")
-    return value_loss
+    if loss_function == "smooth_l1":
+        return F.smooth_l1_loss(v1, v2, reduction="none")
+
+    raise NotImplementedError(f"Unknown loss {loss_function}.")
 
 
 class TargetNetUpdater:
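With the constrained TypeVar, distance_loss keeps a single code path for plain tensors and tensordicts alike; the early returns rely on F.mse_loss and friends accepting tensordict inputs (presumably via tensordict's torch-function dispatch, which the new test exercises end to end). A sketch reproducing the test's numbers through distance_loss directly:

    import torch
    from tensordict import TensorDict
    from torchrl.objectives.utils import distance_loss

    pred = TensorDict(
        {"value_0": torch.tensor([0.0]), "value_1": torch.tensor([0.0])}, batch_size=(1,)
    )
    target = TensorDict(
        {"value_0": torch.tensor([-1.0]), "value_1": torch.tensor([2.0])}, batch_size=(1,)
    )
    # Applied leaf-wise: one unreduced l2 loss per value head.
    loss = distance_loss(pred, target, loss_function="l2")
    # loss["value_0"] == tensor([1.]), loss["value_1"] == tensor([4.])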
@@ -620,13 +610,13 @@ def _reduce(tensor: torch.Tensor, reduction: str) -> float | torch.Tensor:
 
 
 def _clip_value_loss(
-    old_state_value: torch.Tensor,
-    state_value: torch.Tensor,
-    clip_value: torch.Tensor,
-    target_return: torch.Tensor,
-    loss_value: torch.Tensor,
+    old_state_value: torch.Tensor | TensorDict,
+    state_value: torch.Tensor | TensorDict,
+    clip_value: torch.Tensor | TensorDict,
+    target_return: torch.Tensor | TensorDict,
+    loss_value: torch.Tensor | TensorDict,
     loss_critic_type: str,
-):
+) -> tuple[torch.Tensor | TensorDict, torch.Tensor]:
     """Value clipping method for loss computation.
 
     This method computes a clipped state value from the old state value and the state value,
@@ -644,7 +634,7 @@ def _clip_value_loss(
         loss_function=loss_critic_type,
     )
     # Chose the most pessimistic value prediction between clipped and non-clipped
-    loss_value = torch.max(loss_value, loss_value_clipped)
+    loss_value = torch.maximum(loss_value, loss_value_clipped)
     return loss_value, clip_fraction
 
 
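The switch from torch.max to torch.maximum is what makes the pessimistic bound composite-friendly: torch.maximum is the dedicated element-wise binary op, which is the form a tensordict-valued loss_value can be expected to support (an inference from this diff, not verified here), and for plain tensors the semantics are unchanged:

    import torch

    unclipped = torch.tensor([1.0, 4.0])
    clipped = torch.tensor([2.0, 3.0])
    # Element-wise pessimistic choice between the two loss estimates.
    print(torch.maximum(unclipped, clipped))  # tensor([2., 4.])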
