Skip to content

Commit 3da4d6a

Browse files
[Features]: Keep actions and rewards across steps in rollout (#460)
Co-authored-by: vmoens <vincentmoens@gmail.com>
1 parent e0e3cf3 commit 3da4d6a

File tree

2 files changed

+20
-5
lines changed

2 files changed

+20
-5
lines changed

test/mocking_classes.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,15 +109,21 @@ def set_seed(self, seed: int, static_seed: bool = False) -> int:
109109

110110
def _step(self, tensordict):
111111
self.counter += 1
112-
n = torch.tensor([self.counter]).to(self.device).to(torch.get_default_dtype())
112+
n = torch.tensor(
113+
[self.counter], device=self.device, dtype=torch.get_default_dtype()
114+
)
113115
done = self.counter >= self.max_val
114116
done = torch.tensor([done], dtype=torch.bool, device=self.device)
115-
return TensorDict({"reward": n, "done": done, "next_observation": n}, [])
117+
return TensorDict(
118+
{"reward": n, "done": done, "next_observation": n.clone()}, []
119+
)
116120

117121
def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
118122
self.max_val = max(self.counter + 100, self.counter * 2)
119123

120-
n = torch.tensor([self.counter]).to(self.device).to(torch.get_default_dtype())
124+
n = torch.tensor(
125+
[self.counter], device=self.device, dtype=torch.get_default_dtype()
126+
)
121127
done = self.counter >= self.max_val
122128
done = torch.tensor([done], dtype=torch.bool, device=self.device)
123129
return TensorDict({"done": done, "next_observation": n}, [])

torchrl/envs/common.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,11 @@ def __init__(
229229

230230
@classmethod
231231
def __new__(cls, *args, _batch_locked=True, **kwargs):
232-
cls._inplace_update = True
232+
# inplace update will write tensors in-place on the provided tensordict.
233+
# This is risky, especially if gradients need to be passed (in-place copy
234+
# for tensors that are part of computational graphs will result in an error).
235+
# It can also lead to inconsistencies when calling rollout.
236+
cls._inplace_update = False
233237
cls._batch_locked = _batch_locked
234238
return super().__new__(cls)
235239

@@ -552,7 +556,12 @@ def policy(td):
552556
break_when_any_done and tensordict.get("done").any()
553557
) or i == max_steps - 1:
554558
break
555-
tensordict = step_tensordict(tensordict, keep_other=True)
559+
tensordict = step_tensordict(
560+
tensordict,
561+
keep_other=True,
562+
exclude_reward=False,
563+
exclude_action=False,
564+
)
556565

557566
if callback is not None:
558567
callback(self, tensordict)

0 commit comments

Comments (0)