Skip to content

Commit f121f4d

Browse files
author
Vincent Moens
committed
[BugFix] Enable ndim done states in GAE with shifted=True
ghstack-source-id: fb9fd48; Pull-Request resolved: #2962
1 parent 8edc29c commit f121f4d

File tree

2 files changed

+49
-2
lines changed

2 files changed

+49
-2
lines changed

test/test_cost.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14247,6 +14247,46 @@ def _forward_value_estimator_keys(self, **kwargs) -> None:
1424714247

1424814248

1424914249
class TestValues:
14250+
def test_gae_multi_done(self):
14251+
14252+
# constants
14253+
batch_size = 10
14254+
seq_size = 5
14255+
n_dims = batch_size
14256+
gamma = 0.99
14257+
lmbda = 0.98
14258+
14259+
env = SerialEnv(
14260+
batch_size, [functools.partial(GymEnv, "CartPole-v1")] * batch_size
14261+
)
14262+
obs_size = env.full_observation_spec[env.observation_keys[0]].shape[-1]
14263+
14264+
td = env.rollout(seq_size, break_when_any_done=False)
14265+
# make the magic happen: swap dims and create an artificial ndim done state
14266+
done = td["next", "done"].transpose(0, -1)
14267+
terminated = td["next", "terminated"].transpose(0, -1)
14268+
reward = td["next", "reward"].transpose(0, -1)
14269+
td = td[:1]
14270+
td["next", "done"] = done
14271+
td["next", "terminated"] = terminated
14272+
td["next", "reward"] = reward
14273+
14274+
critic = TensorDictModule(
14275+
nn.Linear(obs_size, n_dims),
14276+
in_keys=[("observation",)],
14277+
out_keys=[("state_value",)],
14278+
)
14279+
14280+
gae_shifted = GAE(gamma=gamma, lmbda=lmbda, value_network=critic, shifted=True)
14281+
gae_no_shifted = GAE(
14282+
gamma=gamma, lmbda=lmbda, value_network=critic, shifted=False
14283+
)
14284+
14285+
torch.testing.assert_close(
14286+
gae_shifted(td.clone())["advantage"],
14287+
gae_no_shifted(td.clone())["advantage"],
14288+
)
14289+
1425014290
@pytest.mark.skipif(not _has_gym, reason="requires gym")
1425114291
@pytest.mark.parametrize("module", ["lstm", "gru"])
1425214292
def test_gae_recurrent(self, module):

torchrl/objectives/value/advantages.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -460,11 +460,14 @@ def _call_value_nets(
460460
data_copy = data.copy()
461461
# we are going to modify the done so let's clone it
462462
done = data_copy["next", "done"].clone()
463-
464463
# Mark the last step of every sequence as done. We do this because flattening would cause the trajectories
465464
# of different batches to be merged.
466465
done[(slice(None),) * (ndim - 1) + (-1,)].fill_(True)
466+
truncated = data_copy.get(("next", "truncated"), done)
467+
if truncated is not done:
468+
truncated[(slice(None),) * (ndim - 1) + (-1,)].fill_(True)
467469
data_copy["next", "done"] = done
470+
data_copy["next", "truncated"] = truncated
468471
# Reshape to -1 because we cannot guarantee that all dims have the same number of done states
469472
with data_copy.view(-1) as data_copy_view:
470473
# Interleave next data when done
@@ -482,7 +485,11 @@ def _call_value_nets(
482485
# done = [0, 0, 1, 0, 1, 0, 1]
483486
# done_cs = [0, 0, 0, 1, 1, 2, 2]
484487
# indices = [0, 1, 2, 4, 5, 7, 8]
485-
done_view = data_copy_view["next", "done"].squeeze(-1)
488+
done_view = data_copy_view["next", "done"]
489+
if done_view.shape[-1] == 1:
490+
done_view = done_view.squeeze(-1)
491+
else:
492+
done_view = done_view.any(-1)
486493
done_cs = done_view.cumsum(0)
487494
done_cs = torch.cat([done_cs.new_zeros((1,)), done_cs[:-1]], dim=0)
488495
indices = torch.arange(done_cs.shape[0], device=done_cs.device)

0 commit comments

Comments (0)