[BugFix] Step and maybe reset (#938)

vmoens · web-flow · commit e1620eb0998e · 2023-02-27T10:45:35.000Z
diff --git a/test/mocking_classes.py b/test/mocking_classes.py
@@ -97,6 +97,8 @@ def custom_td(self):
 
 
 class MockSerialEnv(EnvBase):
+    """A simple counting env that is reset after a predefined max number of steps."""
+
     @classmethod
     def __new__(
         cls,
@@ -844,9 +846,16 @@ def forward(self, observation, action):
 
 
 class CountingEnv(EnvBase):
-    def __init__(self, max_steps: int = 5, **kwargs):
+    """An env that is done after a given number of steps.
+
+    The action is the count increment.
+
+    """
+
+    def __init__(self, max_steps: int = 5, start_val: int = 0, **kwargs):
         super().__init__(**kwargs)
         self.max_steps = max_steps
+        self.start_val = start_val
 
         self.observation_spec = CompositeSpec(
             observation=UnboundedContinuousTensorSpec(
@@ -878,9 +887,9 @@ def _set_seed(self, seed: Optional[int]):
     def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
         if tensordict is not None and "_reset" in tensordict.keys():
             _reset = tensordict.get("_reset")
-            self.count[_reset] = 0
+            self.count[_reset] = self.start_val
         else:
-            self.count[:] = 0
+            self.count[:] = self.start_val
         return TensorDict(
             source={
                 "observation": self.count.clone(),
@@ -905,3 +914,87 @@ def _step(
             batch_size=self.batch_size,
             device=self.device,
         )
+
+
+class CountingBatchedEnv(EnvBase):
+    """An env that is done after a given number of steps.
+
+    The action is the count increment.
+
+    Unlike ``CountingEnv``, different envs of the batch can have different ``max_steps``.
+    """
+
+    def __init__(
+        self,
+        max_steps: torch.Tensor = None,
+        start_val: torch.Tensor = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if max_steps is None:
+            max_steps = torch.tensor(5)
+        if start_val is None:
+            start_val = torch.zeros(())
+        if not max_steps.shape == self.batch_size:
+            raise RuntimeError("batch_size and max_steps shape must match.")
+
+        self.max_steps = max_steps
+        self.start_val = start_val
+
+        self.observation_spec = CompositeSpec(
+            observation=UnboundedContinuousTensorSpec(
+                (
+                    *self.batch_size,
+                    1,
+                )
+            ),
+            shape=self.batch_size,
+        )
+        self.reward_spec = UnboundedContinuousTensorSpec(
+            (
+                *self.batch_size,
+                1,
+            )
+        )
+        self.input_spec = CompositeSpec(
+            action=BinaryDiscreteTensorSpec(n=1, shape=[*self.batch_size, 1]),
+            shape=self.batch_size,
+        )
+
+        self.count = torch.zeros(
+            (*self.batch_size, 1), device=self.device, dtype=torch.int
+        )
+
+    def _set_seed(self, seed: Optional[int]):
+        torch.manual_seed(seed)
+
+    def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
+        if tensordict is not None and "_reset" in tensordict.keys():
+            _reset = tensordict.get("_reset")
+            self.count[_reset] = self.start_val[_reset].unsqueeze(-1)
+        else:
+            self.count[:] = self.start_val.unsqueeze(-1)
+        return TensorDict(
+            source={
+                "observation": self.count.clone(),
+                "done": self.count > self.max_steps.unsqueeze(-1),
+            },
+            batch_size=self.batch_size,
+            device=self.device,
+        )
+
+    def _step(
+        self,
+        tensordict: TensorDictBase,
+    ) -> TensorDictBase:
+        action = tensordict.get("action")
+        self.count += action.to(torch.int).unsqueeze(-1)
+        return TensorDict(
+            source={
+                "observation": self.count,
+                "done": self.count > self.max_steps.unsqueeze(-1),
+                "reward": torch.zeros_like(self.count, dtype=torch.float),
+            },
+            batch_size=self.batch_size,
+            device=self.device,
+        )
diff --git a/test/test_collector.py b/test/test_collector.py
@@ -12,6 +12,8 @@
 from _utils_internal import generate_seeds, PENDULUM_VERSIONED, PONG_VERSIONED
 from mocking_classes import (
     ContinuousActionVecMockEnv,
+    CountingBatchedEnv,
+    CountingEnv,
     DiscreteActionConvMockEnv,
     DiscreteActionConvPolicy,
     DiscreteActionVecMockEnv,
@@ -1181,6 +1183,52 @@ def test_auto_wrap_error(self, collector_class, env_maker):
             )
 
 
+@pytest.mark.parametrize("env_class", [CountingEnv, CountingBatchedEnv])
+def test_initial_obs_consistency(env_class, seed=1):
+    torch.manual_seed(seed)
+    start_val = 4
+    if env_class == CountingEnv:
+        num_envs = 1
+        env = CountingEnv(device="cpu", max_steps=8, start_val=start_val)
+        max_steps = 8
+    elif env_class == CountingBatchedEnv:
+        num_envs = 2
+        env = CountingBatchedEnv(
+            device="cpu",
+            batch_size=[num_envs],
+            max_steps=torch.arange(num_envs) + 17,
+            start_val=torch.ones([num_envs]) * start_val,
+        )
+        max_steps = env.max_steps.max().item()
+    env.set_seed(seed)
+    policy = lambda tensordict: tensordict.set(
+        "action", torch.ones(tensordict.shape, dtype=torch.int)
+    )
+    collector = SyncDataCollector(
+        create_env_fn=env,
+        policy=policy,
+        frames_per_batch=((max_steps - 3) * 2 + 2) * num_envs,  # at least two episodes
+        split_trajs=False,
+    )
+    for _d in collector:
+        break
+    obs = _d["observation"].squeeze()
+    if env_class == CountingEnv:
+        arange_0 = start_val + torch.arange(max_steps - 3)
+        arange = start_val + torch.arange(2)
+        expected = torch.cat([arange_0, arange_0, arange]).float()
+    else:
+        # the first env has a shorter horizon than the second
+        arange_0 = start_val + torch.arange(max_steps - 3 - 1)
+        arange = start_val + torch.arange(start_val)
+        expected_0 = torch.cat([arange_0, arange_0, arange]).float()
+        arange_0 = start_val + torch.arange(max_steps - 3)
+        arange = start_val + torch.arange(2)
+        expected_1 = torch.cat([arange_0, arange_0, arange]).float()
+        expected = torch.stack([expected_0, expected_1])
+    assert torch.allclose(obs, expected)
+
+
 def weight_reset(m):
     if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
         m.reset_parameters()
diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py
@@ -571,7 +571,7 @@ def iterator(self) -> Iterator[TensorDictBase]:
             if self._frames >= self.total_frames:
                 break
 
-    def _reset_if_necessary(self) -> None:
+    def _step_and_maybe_reset(self) -> None:
         done = self._tensordict.get("done")
         if not self.reset_when_done:
             done = torch.zeros_like(done)
@@ -592,6 +592,11 @@ def _reset_if_necessary(self) -> None:
             done_or_terminated = done_or_terminated | _reset
 
         if done_or_terminated.any():
+            if not done_or_terminated.all():
+                self._tensordict[~done_or_terminated] = step_mdp(
+                    self._tensordict[~done_or_terminated]
+                )
+
             traj_ids = self._tensordict.get(("collector", "traj_ids")).clone()
             steps = steps.clone()
             if len(self.env.batch_size):
@@ -617,6 +622,8 @@ def _reset_if_necessary(self) -> None:
                 ("collector", "traj_ids"), traj_ids
             )  # no ops if they already match
             self._tensordict.set_(("collector", "step_count"), steps)
+        else:
+            self._tensordict.update(step_mdp(self._tensordict), inplace=True)
 
     @torch.no_grad()
     def rollout(self) -> TensorDictBase:
@@ -651,8 +658,7 @@ def rollout(self) -> TensorDictBase:
                     if is_shared:
                         self._tensordict_out.share_memory_()
 
-                self._reset_if_necessary()
-                self._tensordict.update(step_mdp(self._tensordict), inplace=True)
+                self._step_and_maybe_reset()
 
         return self._tensordict_out