Commit 90c8e40

Author: Vincent Moens
[BugFix] Better account of composite distributions in PPO
ghstack-source-id: 3d86f99
Pull Request resolved: #2622
1 parent: d537dcb

File tree: 3 files changed, +58 -22 lines

  torchrl/objectives/ppo.py
  torchrl/objectives/utils.py
  torchrl/objectives/value/advantages.py

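For background: tensordict's CompositeDistribution wraps several leaf distributions whose parameters and samples live in a TensorDict. With aggregate_probabilities=False, its log_prob returns a TensorDict of per-head log-probabilities instead of a single tensor, which is the shape mismatch this commit accounts for. A minimal sketch under that assumption (the head names and shapes below are illustrative, not taken from the commit):

    import torch
    from tensordict import TensorDict
    from tensordict.nn import CompositeDistribution
    from torch import distributions as d

    # Two illustrative action heads, parametrized through a TensorDict.
    params = TensorDict(
        {
            "cont": {"loc": torch.zeros(4, 3), "scale": torch.ones(4, 3)},
            "disc": {"logits": torch.randn(4, 5)},
        },
        batch_size=[4],
    )
    dist = CompositeDistribution(
        params, distribution_map={"cont": d.Normal, "disc": d.Categorical}
    )
    sample = dist.sample()  # a TensorDict holding one sample per head
    # Non-aggregated log_prob yields per-head entries (e.g. "cont_log_prob")
    # rather than one tensor -- the case the PPO loss must now reduce itself.
    log_probs = dist.log_prob(
        sample, aggregate_probabilities=False, inplace=False, include_sum=False
    )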

torchrl/objectives/ppo.py: 36 additions & 17 deletions
@@ -18,6 +18,7 @@
     TensorDictParams,
 )
 from tensordict.nn import (
+    CompositeDistribution,
     dispatch,
     ProbabilisticTensorDictModule,
     ProbabilisticTensorDictSequential,
@@ -33,6 +34,7 @@
     _clip_value_loss,
     _GAMMA_LMBDA_DEPREC_ERROR,
     _reduce,
+    _sum_td_features,
     default_value_kwargs,
     distance_loss,
     ValueEstimators,
@@ -462,9 +464,13 @@ def reset(self) -> None:
 
     def get_entropy_bonus(self, dist: d.Distribution) -> torch.Tensor:
         try:
-            entropy = dist.entropy()
+            if isinstance(dist, CompositeDistribution):
+                kwargs = {"aggregate_probabilities": False, "include_sum": False}
+            else:
+                kwargs = {}
+            entropy = dist.entropy(**kwargs)
             if is_tensor_collection(entropy):
-                entropy = entropy.get(dist.entropy_key)
+                entropy = _sum_td_features(entropy)
         except NotImplementedError:
             x = dist.rsample((self.samples_mc_entropy,))
             log_prob = dist.log_prob(x)
@@ -497,13 +503,20 @@ def _log_weight(
         if isinstance(action, torch.Tensor):
             log_prob = dist.log_prob(action)
         else:
-            maybe_log_prob = dist.log_prob(tensordict)
-            if not isinstance(maybe_log_prob, torch.Tensor):
-                # In some cases (Composite distribution with aggregate_probabilities toggled off) the returned type may not
-                # be a tensor
-                log_prob = maybe_log_prob.get(self.tensor_keys.sample_log_prob)
+            if isinstance(dist, CompositeDistribution):
+                is_composite = True
+                kwargs = {
+                    "inplace": False,
+                    "aggregate_probabilities": False,
+                    "include_sum": False,
+                }
             else:
-                log_prob = maybe_log_prob
+                is_composite = False
+                kwargs = {}
+            log_prob = dist.log_prob(tensordict, **kwargs)
+            if is_composite and not isinstance(prev_log_prob, TensorDict):
+                log_prob = _sum_td_features(log_prob)
+                log_prob.view_as(prev_log_prob)
 
         log_weight = (log_prob - prev_log_prob).unsqueeze(-1)
         kl_approx = (prev_log_prob - log_prob).unsqueeze(-1)
@@ -598,6 +611,9 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
             advantage = (advantage - loc) / scale
 
         log_weight, dist, kl_approx = self._log_weight(tensordict)
+        if is_tensor_collection(log_weight):
+            log_weight = _sum_td_features(log_weight)
+            log_weight = log_weight.view(advantage.shape)
         neg_loss = log_weight.exp() * advantage
         td_out = TensorDict({"loss_objective": -neg_loss}, batch_size=[])
         if self.entropy_bonus:
@@ -1149,16 +1165,19 @@ def forward(self, tensordict: TensorDictBase) -> TensorDict:
             kl = torch.distributions.kl.kl_divergence(previous_dist, current_dist)
         except NotImplementedError:
             x = previous_dist.sample((self.samples_mc_kl,))
-            previous_log_prob = previous_dist.log_prob(x)
-            current_log_prob = current_dist.log_prob(x)
+            if isinstance(previous_dist, CompositeDistribution):
+                kwargs = {
+                    "aggregate_probabilities": False,
+                    "inplace": False,
+                    "include_sum": False,
+                }
+            else:
+                kwargs = {}
+            previous_log_prob = previous_dist.log_prob(x, **kwargs)
+            current_log_prob = current_dist.log_prob(x, **kwargs)
             if is_tensor_collection(current_log_prob):
-                previous_log_prob = previous_log_prob.get(
-                    self.tensor_keys.sample_log_prob
-                )
-                current_log_prob = current_log_prob.get(
-                    self.tensor_keys.sample_log_prob
-                )
-
+                previous_log_prob = _sum_td_features(previous_log_prob)
+                current_log_prob = _sum_td_features(current_log_prob)
             kl = (previous_log_prob - current_log_prob).mean(0)
             kl = kl.unsqueeze(-1)
             neg_loss = neg_loss - self.beta * kl
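The pattern repeated across these hunks: when the distribution is composite, request the non-aggregated per-head log-probs or entropies, then reduce them with the new _sum_td_features helper. A sketch of the entropy path, assuming `dist` is the CompositeDistribution sketched earlier:

    # `dist` as constructed in the illustrative example above.
    entropy = dist.entropy(aggregate_probabilities=False, include_sum=False)
    # `entropy` is a TensorDict with one entry per head; summing the feature
    # dims of each entry and then the entries themselves yields the single
    # per-sample tensor the PPO entropy bonus expects.
    entropy = entropy.sum(dim="feature", reduce=True)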

torchrl/objectives/utils.py: 5 additions & 0 deletions
@@ -615,3 +615,8 @@ def group_optimizers(*optimizers: torch.optim.Optimizer) -> torch.optim.Optimizer:
             raise ValueError("Cannot group optimizers of different type.")
         params.extend(optimizer.param_groups)
     return cls(params)
+
+
+def _sum_td_features(data: TensorDictBase) -> torch.Tensor:
+    # Sum all features and return a tensor
+    return data.sum(dim="feature", reduce=True)
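A usage sketch for the new helper (entry names and shapes are illustrative): dim="feature" sums over each leaf's non-batch dims, and reduce=True adds the leaves together, returning a plain tensor with the batch shape.

    import torch
    from tensordict import TensorDict

    td = TensorDict(
        {"head0_log_prob": torch.randn(4, 3), "head1_log_prob": torch.randn(4)},
        batch_size=[4],
    )
    out = td.sum(dim="feature", reduce=True)  # what _sum_td_features(td) returns
    assert out.shape == torch.Size([4])  # one scalar per batch element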

torchrl/objectives/value/advantages.py: 17 additions & 5 deletions
@@ -15,11 +15,14 @@
 import torch
 from tensordict import TensorDictBase
 from tensordict.nn import (
+    CompositeDistribution,
     dispatch,
+    ProbabilisticTensorDictModule,
     set_skip_existing,
     TensorDictModule,
     TensorDictModuleBase,
 )
+from tensordict.nn.probabilistic import interaction_type
 from tensordict.utils import NestedKey
 from torch import Tensor
 
@@ -74,14 +77,22 @@ def new_func(self, *args, **kwargs):
 
 
 def _call_actor_net(
-    actor_net: TensorDictModuleBase,
+    actor_net: ProbabilisticTensorDictModule,
     data: TensorDictBase,
     params: TensorDictBase,
     log_prob_key: NestedKey,
 ):
-    # TODO: extend to handle time dimension (and vmap?)
-    log_pi = actor_net(data.select(*actor_net.in_keys, strict=False)).get(log_prob_key)
-    return log_pi
+    dist = actor_net.get_dist(data.select(*actor_net.in_keys, strict=False))
+    if isinstance(dist, CompositeDistribution):
+        kwargs = {
+            "aggregate_probabilities": True,
+            "inplace": False,
+            "include_sum": False,
+        }
+    else:
+        kwargs = {}
+    s = actor_net._dist_sample(dist, interaction_type=interaction_type())
+    return dist.log_prob(s, **kwargs)
 
 
 class ValueEstimatorBase(TensorDictModuleBase):
@@ -1771,7 +1782,8 @@ def forward(
             data=tensordict,
             params=None,
             log_prob_key=self.tensor_keys.sample_log_prob,
-        ).view_as(value)
+        )
+        log_pi = log_pi.view_as(value)
 
         # Compute the V-Trace correction
         done = tensordict.get(("next", self.tensor_keys.done))
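The rewritten _call_actor_net builds the distribution explicitly instead of reading a pre-computed log-prob entry, so a composite actor can aggregate its per-head log-probs into the single tensor V-Trace needs. A sketch of the equivalent call sequence, assuming `actor` is a ProbabilisticTensorDictModule over a composite head and `data` is a TensorDict (note that `_dist_sample` is the private hook the diff relies on):

    from tensordict.nn.probabilistic import interaction_type

    # `actor` and `data` are assumed to exist, as in the diff above.
    dist = actor.get_dist(data.select(*actor.in_keys, strict=False))
    sample = actor._dist_sample(dist, interaction_type=interaction_type())
    # aggregate_probabilities=True makes CompositeDistribution.log_prob return
    # the summed log-prob as one tensor, so the caller can .view_as(value).
    log_pi = dist.log_prob(
        sample, aggregate_probabilities=True, inplace=False, include_sum=False
    )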
