Commit a02679b

[BugFix] Improve collector buffer initialisation when policy spec is unavailable (#1547)
Signed-off-by: Matteo Bettini <matbet@meta.com>
Co-authored-by: vmoens <vincentmoens@gmail.com>
1 parent 802f0e4 commit a02679b

2 files changed (+47 −46 lines)

test/test_collector.py

Lines changed: 20 additions & 0 deletions
@@ -1397,6 +1397,26 @@ def test_reset_heterogeneous_envs():
     ).all()


+def test_policy_with_mask():
+    env = CountingBatchedEnv(start_val=torch.tensor(10), max_steps=torch.tensor(1e5))
+
+    def policy(td):
+        obs = td.get("observation")
+        # This policy cannot work with obs all 0s
+        if not obs.any():
+            raise AssertionError
+        action = obs.clone()
+        td.set("action", action)
+        return td
+
+    collector = SyncDataCollector(
+        env, policy=policy, frames_per_batch=10, total_frames=20
+    )
+    for _ in collector:
+        break
+    collector.shutdown()
+
+
 class TestNestedEnvsCollector:
     def test_multi_collector_nested_env_consistency(self, seed=1):
         env = NestedCountingEnv()
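
For context, a minimal standalone sketch (not part of the commit) of the failure mode this test guards against. The names masking_policy, fake_td and reset_td are hypothetical stand-ins for the collector's policy and for the outputs of env.fake_tensordict() and env.reset(); fake_tensordict() returns zero-filled entries, so a policy that masks on its observation would previously be handed a blank input during buffer initialisation.

import torch
from tensordict import TensorDict

def masking_policy(td):
    # hypothetical policy that, like the one in test_policy_with_mask, rejects all-zero observations
    obs = td.get("observation")
    if not obs.any():
        raise AssertionError("blank (all-zero) observation")
    td.set("action", obs.clone())
    return td

fake_td = TensorDict({"observation": torch.zeros(4, 3)}, batch_size=[4])          # stand-in for env.fake_tensordict()
reset_td = TensorDict({"observation": torch.full((4, 3), 10.0)}, batch_size=[4])  # stand-in for env.reset()

# masking_policy(fake_td)                         # pre-fix behaviour: raises on the blank fake data
masking_policy(fake_td.clone().update(reset_td))  # post-fix behaviour: real reset data is merged in first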

torchrl/collectors/collectors.py

Lines changed: 27 additions & 46 deletions
@@ -621,64 +621,45 @@ def __init__(
         )

         with torch.no_grad():
-            self._tensordict_out = env.fake_tensordict()
+            self._tensordict_out = self.env.fake_tensordict()
+        # If the policy has a valid spec, we use it
         if (
-            hasattr(self.policy, "spec")
-            and self.policy.spec is not None
-            and all(
-                v is not None for v in self.policy.spec.values(True, True)
-            )  # if a spec is None, we don't know anything about it
-            # and set(self.policy.spec.keys(True, True)) == set(self.policy.out_keys)
-            and any(
-                key not in self._tensordict_out.keys(isinstance(key, tuple))
-                for key in self.policy.spec.keys(True, True)
-            )
-        ):
-            # if policy spec is non-empty, all the values are not None and the keys
-            # match the out_keys we assume the user has given all relevant information
-            # the policy could have more keys than the env:
-            policy_spec = self.policy.spec
-            if policy_spec.ndim < self._tensordict_out.ndim:
-                policy_spec = policy_spec.expand(self._tensordict_out.shape)
-            for key, spec in policy_spec.items(True, True):
-                if key in self._tensordict_out.keys(isinstance(key, tuple)):
-                    continue
-                self._tensordict_out.set(key, spec.zero())
-            self._tensordict_out = (
-                self._tensordict_out.unsqueeze(-1)
-                .expand(*env.batch_size, self.frames_per_batch)
-                .clone()
-            )
-        elif (
             hasattr(self.policy, "spec")
             and self.policy.spec is not None
             and all(v is not None for v in self.policy.spec.values(True, True))
-            and all(
-                key in self._tensordict_out.keys(isinstance(key, tuple))
-                for key in self.policy.spec.keys(True, True)
-            )
         ):
-            # reach this if the policy has specs and they match with the fake tensordict
-            self._tensordict_out = (
-                self._tensordict_out.unsqueeze(-1)
-                .expand(*env.batch_size, self.frames_per_batch)
-                .clone()
-            )
+            if any(
+                key not in self._tensordict_out.keys(isinstance(key, tuple))
+                for key in self.policy.spec.keys(True, True)
+            ):
+                # if policy spec is non-empty, all the values are not None and the keys
+                # match the out_keys we assume the user has given all relevant information
+                # the policy could have more keys than the env:
+                policy_spec = self.policy.spec
+                if policy_spec.ndim < self._tensordict_out.ndim:
+                    policy_spec = policy_spec.expand(self._tensordict_out.shape)
+                for key, spec in policy_spec.items(True, True):
+                    if key in self._tensordict_out.keys(isinstance(key, tuple)):
+                        continue
+                    self._tensordict_out.set(key, spec.zero())
+
         else:
             # otherwise, we perform a small number of steps with the policy to
             # determine the relevant keys with which to pre-populate _tensordict_out.
             # This is the safest thing to do if the spec has None fields or if there is
             # no spec at all.
             # See #505 for additional context.
+            self._tensordict_out.update(self._tensordict)
             with torch.no_grad():
-                self._tensordict_out = self._tensordict_out.to(self.device)
-                self._tensordict_out = self.policy(self._tensordict_out).unsqueeze(-1)
-                self._tensordict_out = (
-                    self._tensordict_out.expand(*env.batch_size, self.frames_per_batch)
-                    .clone()
-                    .zero_()
-                )
-        # in addition to outputs of the policy, we add traj_ids and step_count to
+                self._tensordict_out = self.policy(self._tensordict_out.to(self.device))
+
+        self._tensordict_out = (
+            self._tensordict_out.unsqueeze(-1)
+            .expand(*env.batch_size, self.frames_per_batch)
+            .clone()
+            .zero_()
+        )
+        # in addition to outputs of the policy, we add traj_ids to
         # _tensordict_out which will be collected during rollout
         self._tensordict_out = self._tensordict_out.to(self.storing_device)
         self._tensordict_out.set(
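
Assuming the rest of SyncDataCollector.__init__ behaves as shown in the hunk above, the no-spec fallback can be summarised with this hedged, standalone sketch; fake_td, reset_td, policy, batch_size and frames_per_batch are hypothetical stand-ins for the collector's internals, not TorchRL API.

import torch
from tensordict import TensorDict

batch_size = (4,)
frames_per_batch = 10

fake_td = TensorDict({"observation": torch.zeros(*batch_size, 3)}, batch_size=batch_size)          # env.fake_tensordict() stand-in
reset_td = TensorDict({"observation": torch.full((*batch_size, 3), 10.0)}, batch_size=batch_size)  # env.reset() stand-in

def policy(td):
    td.set("action", td.get("observation").clone())
    return td

# New behaviour: seed the fake tensordict with real reset data before calling
# the policy, so key discovery never feeds the policy blank inputs.
out = fake_td.clone()
out.update(reset_td)
with torch.no_grad():
    out = policy(out)

# The single-step output is then expanded into the rollout buffer and zeroed,
# mirroring the unsqueeze/expand/clone/zero_ chain factored out in the diff.
buffer = out.unsqueeze(-1).expand(*batch_size, frames_per_batch).clone().zero_()
print(buffer.batch_size)  # torch.Size([4, 10])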
