pytorch
diff --git a/‎docs/source/reference/data.rst
Lines changed: 6 additions & 5 deletions b/‎docs/source/reference/data.rst
Lines changed: 6 additions & 5 deletions
diff --git a/‎test/test_env.py
Lines changed: 265 additions & 24 deletions b/‎test/test_env.py
Lines changed: 265 additions & 24 deletions
@@ -1144,16 +1144,17 @@ Utils
     :toctree: generated/
     :template: rl_template.rst
 
-    MultiStep
-    consolidate_spec
-    check_no_exclusive_keys
-    contains_lazy_spec
-    Nested2TED
+    DensifyReward
     Flat2TED
     H5Combine
     H5Split
+    MultiStep
+    Nested2TED
     TED2Flat
     TED2Nested
+    check_no_exclusive_keys
+    consolidate_spec
+    contains_lazy_spec
 
 .. currentmodule:: torchrl.envs.transforms.rb_transforms
 
 
@@ -4616,11 +4616,13 @@ def __next__(self):
     @pytest.mark.parametrize("batch_size", [0, 4])
     @pytest.mark.parametrize("device", [None, "cpu"])
     def test_llm_env(self, str2str, batched, stack_method, device, batch_size):
-        env = LLMEnv(str2str=str2str, device=device)
+        env = LLMEnv(
+            str2str=str2str, device=device, has_attention=False, no_stack=False
+        )
         if str2str:
             primer = DataLoadingPrimer(
                 dataloader=self.DummyDataLoader(batch_size=batch_size),
-                data_keys=["observation"],
+                data_keys=[LLMEnv._DEFAULT_STR_KEY],
                 example_data="a string!",
             )
         else:
@@ -4630,7 +4632,7 @@ def test_llm_env(self, str2str, batched, stack_method, device, batch_size):
                 dataloader=self.DummyTensorDataLoader(
                     batch_size=batch_size, padding=True
                 ),
-                data_keys=["observation"],
+                data_keys=[LLMEnv._DEFAULT_TOKEN_KEY],
                 data_specs=[Unbounded(shape=(-1,), dtype=torch.int64)],
                 stack_method=stack_method,
             )
@@ -4640,7 +4642,7 @@ def test_llm_env(self, str2str, batched, stack_method, device, batch_size):
         if batched:
             td = env.reset(TensorDict(batch_size=[3]))
             env.check_env_specs(break_when_any_done="both", tensordict=td)
-            r = env.rollout(10, tensordict=TensorDict(batch_size=[3]))
+            env.rollout(10, tensordict=TensorDict(batch_size=[3]))
         else:
             env.check_env_specs(break_when_any_done="both")
 
@@ -4663,7 +4665,7 @@ def test_llm_from_dataloader(
         if str2str:
             kwargs = {
                 "dataloader": self.DummyDataLoader(batch_size=batch_size),
-                "data_keys": ["observation"],
+                "data_keys": [LLMEnv._DEFAULT_STR_KEY],
                 "example_data": "a string!",
             }
         else:
@@ -4673,11 +4675,18 @@ def test_llm_from_dataloader(
                 "dataloader": self.DummyTensorDataLoader(
                     padding=True, batch_size=batch_size
                 ),
-                "data_keys": ["observation"],
+                "data_keys": [LLMEnv._DEFAULT_TOKEN_KEY],
                 "data_specs": [Unbounded(shape=(-1,), dtype=torch.int64)],
                 "stack_method": stack_method,
             }
-        kwargs.update({"str2str": str2str, "device": device})
+        kwargs.update(
+            {
+                "str2str": str2str,
+                "device": device,
+                "has_attention": False,
+                "no_stack": False,
+            }
+        )
         env = LLMEnv.from_dataloader(**kwargs)
         assert not env.batch_locked
         if batched:
@@ -4690,51 +4699,283 @@ def test_llm_from_dataloader(
             def policy(td):
                 if str2str:
                     if not td.shape:
-                        td["action"] = "<nothing>"
+                        td[LLMEnv._DEFAULT_ACTION_KEY] = "<nothing>"
                     else:
-                        td["action"] = NonTensorStack(
+                        td[LLMEnv._DEFAULT_ACTION_KEY] = NonTensorStack(
                             *["<nothing>" for _ in range(td.shape[0])]
                         )
                 else:
-                    td["action"] = torch.ones(td.shape + (1,), dtype=torch.int64)
+                    td[LLMEnv._DEFAULT_ACTION_KEY] = torch.ones(
+                        td.shape + (1,), dtype=torch.int64
+                    )
                 return td
 
             if batched:
                 # Tell the env that we want 3 sub-envs
                 r = env.rollout(10, policy, tensordict=TensorDict(batch_size=[3]))
                 assert r.ndim == 2
                 if str2str:
-                    assert isinstance(r[0, 0]["observation"], str)
-                    assert isinstance(r[0, 1]["observation"], str)
+                    assert isinstance(r[0, 0][LLMEnv._DEFAULT_STR_KEY], str)
+                    assert isinstance(r[0, 1][LLMEnv._DEFAULT_STR_KEY], str)
                     assert (
-                        r[0, 0]["observation"]
-                        == r[0, 1]["observation"][: -len(r[0, 0]["action"])]
+                        r[0, 0][LLMEnv._DEFAULT_STR_KEY]
+                        == r[0, 1][LLMEnv._DEFAULT_STR_KEY][
+                            : -len(r[0, 0][LLMEnv._DEFAULT_ACTION_KEY])
+                        ]
                     )
                     assert (
-                        r[0, 1]["observation"]
-                        == r[0, 2]["observation"][: -len(r[0, 1]["action"])]
+                        r[0, 1][LLMEnv._DEFAULT_STR_KEY]
+                        == r[0, 2][LLMEnv._DEFAULT_STR_KEY][
+                            : -len(r[0, 1][LLMEnv._DEFAULT_ACTION_KEY])
+                        ]
                     )
                     assert (
-                        r[-1, 0]["observation"]
-                        == r[-1, 1]["observation"][: -len(r[-1, 0]["action"])]
+                        r[-1, 0][LLMEnv._DEFAULT_STR_KEY]
+                        == r[-1, 1][LLMEnv._DEFAULT_STR_KEY][
+                            : -len(r[-1, 0][LLMEnv._DEFAULT_ACTION_KEY])
+                        ]
                     )
                     assert (
-                        r[-1, 1]["observation"]
-                        == r[-1, 2]["observation"][: -len(r[-1, 1]["action"])]
+                        r[-1, 1][LLMEnv._DEFAULT_STR_KEY]
+                        == r[-1, 2][LLMEnv._DEFAULT_STR_KEY][
+                            : -len(r[-1, 1][LLMEnv._DEFAULT_ACTION_KEY])
+                        ]
                     )
                 else:
-                    assert (r[0, 0]["observation"] == r[0, 1]["observation"][:-1]).all()
-                    assert (r[0, 1]["observation"] == r[0, 2]["observation"][:-1]).all()
                     assert (
-                        r[-1, 0]["observation"] == r[-1, 1]["observation"][:-1]
+                        r[0, 0][LLMEnv._DEFAULT_TOKEN_KEY]
+                        == r[0, 1][LLMEnv._DEFAULT_TOKEN_KEY][:-1]
+                    ).all()
+                    assert (
+                        r[0, 1][LLMEnv._DEFAULT_TOKEN_KEY]
+                        == r[0, 2][LLMEnv._DEFAULT_TOKEN_KEY][:-1]
                     ).all()
                     assert (
-                        r[-1, 1]["observation"] == r[-1, 2]["observation"][:-1]
+                        r[-1, 0][LLMEnv._DEFAULT_TOKEN_KEY]
+                        == r[-1, 1][LLMEnv._DEFAULT_TOKEN_KEY][:-1]
+                    ).all()
+                    assert (
+                        r[-1, 1][LLMEnv._DEFAULT_TOKEN_KEY]
+                        == r[-1, 2][LLMEnv._DEFAULT_TOKEN_KEY][:-1]
                     ).all()
             else:
                 r = env.rollout(10, policy, tensordict=TensorDict(batch_size=[]))
                 assert r.ndim == 1
 
+    @pytest.mark.parametrize(
+        "str2str,stack_method",
+        [
+            [True, None],
+            [False, "as_padded_tensor"],
+            # TODO: a bit experimental, fails with check_env_specs
+            # [False, "as_nested_tensor"],
+            [False, None],
+        ],
+    )
+    @pytest.mark.parametrize("batched", [True, False])
+    @pytest.mark.parametrize("device", [None, "cpu"])
+    @pytest.mark.parametrize("batch_size", [0, 4])
+    @pytest.mark.parametrize("repeats", [3])
+    def test_llm_from_dataloader_repeats(
+        self, str2str, batched, stack_method, device, batch_size, repeats
+    ):
+        if str2str:
+            kwargs = {
+                "dataloader": self.DummyDataLoader(batch_size=batch_size),
+                "data_keys": [LLMEnv._DEFAULT_STR_KEY],
+                "example_data": "a string!",
+                "repeats": repeats,
+            }
+        else:
+            if stack_method is None:
+                stack_method = as_padded_tensor
+            kwargs = {
+                "dataloader": self.DummyTensorDataLoader(
+                    padding=True, batch_size=batch_size
+                ),
+                "data_keys": [LLMEnv._DEFAULT_TOKEN_KEY],
+                "data_specs": [Unbounded(shape=(-1,), dtype=torch.int64)],
+                "stack_method": stack_method,
+                "repeats": repeats,
+            }
+        kwargs.update(
+            {
+                "str2str": str2str,
+                "device": device,
+                "has_attention": False,
+                "no_stack": False,
+            }
+        )
+        env = LLMEnv.from_dataloader(**kwargs)
+        assert env.transform.repeats == repeats
+
+        max_steps = 3
+        env.append_transform(StepCounter(max_steps=max_steps))
+
+        def policy(td):
+            if str2str:
+                if not td.shape:
+                    td[LLMEnv._DEFAULT_ACTION_KEY] = "<nothing>"
+                else:
+                    td[LLMEnv._DEFAULT_ACTION_KEY] = NonTensorStack(
+                        *["<nothing>" for _ in range(td.shape[0])]
+                    )
+            else:
+                td[LLMEnv._DEFAULT_ACTION_KEY] = torch.ones(
+                    td.shape + (1,), dtype=torch.int64
+                )
+            return td
+
+        if batched:
+            r = env.rollout(
+                100,
+                policy,
+                tensordict=TensorDict(batch_size=[3]),
+                break_when_any_done=False,
+            )
+        else:
+            r = env.rollout(100, policy, break_when_any_done=False)
+        # check that r at reset is always the same
+        r_reset = r[..., ::max_steps]
+        if not batched:
+            if str2str:
+                assert (
+                    r_reset[..., 0][LLMEnv._DEFAULT_STR_KEY]
+                    == r_reset[..., 1][LLMEnv._DEFAULT_STR_KEY]
+                )
+                assert (
+                    r_reset[..., 0][LLMEnv._DEFAULT_STR_KEY]
+                    == r_reset[..., 2][LLMEnv._DEFAULT_STR_KEY]
+                )
+                assert (
+                    r_reset[..., 0][LLMEnv._DEFAULT_STR_KEY]
+                    != r_reset[..., 3][LLMEnv._DEFAULT_STR_KEY]
+                )
+            else:
+                assert (
+                    r_reset[..., 0][LLMEnv._DEFAULT_TOKEN_KEY]
+                    == r_reset[..., 1][LLMEnv._DEFAULT_TOKEN_KEY]
+                ).all()
+                assert (
+                    r_reset[..., 0][LLMEnv._DEFAULT_TOKEN_KEY]
+                    == r_reset[..., 2][LLMEnv._DEFAULT_TOKEN_KEY]
+                ).all()
+                assert (
+                    r_reset[..., 0][LLMEnv._DEFAULT_TOKEN_KEY]
+                    != r_reset[..., 3][LLMEnv._DEFAULT_TOKEN_KEY]
+                ).any()
+        else:
+            # When batched, each block contains the 3 reset packs
+            if str2str:
+                assert (
+                    r_reset[0, 0][LLMEnv._DEFAULT_STR_KEY]
+                    == r_reset[1, 0][LLMEnv._DEFAULT_STR_KEY]
+                )
+                assert (
+                    r_reset[0, 0][LLMEnv._DEFAULT_STR_KEY]
+                    == r_reset[2, 0][LLMEnv._DEFAULT_STR_KEY]
+                )
+                assert (
+                    r_reset[0, 0][LLMEnv._DEFAULT_STR_KEY]
+                    != r_reset[0, 1][LLMEnv._DEFAULT_STR_KEY]
+                )
+            else:
+                assert (
+                    r_reset[0, 0][LLMEnv._DEFAULT_TOKEN_KEY]
+                    == r_reset[1, 0][LLMEnv._DEFAULT_TOKEN_KEY]
+                ).all()
+                assert (
+                    r_reset[0, 0][LLMEnv._DEFAULT_TOKEN_KEY]
+                    == r_reset[2, 0][LLMEnv._DEFAULT_TOKEN_KEY]
+                ).all()
+                assert (
+                    r_reset[0, 0][LLMEnv._DEFAULT_TOKEN_KEY]
+                    != r_reset[0, 1][LLMEnv._DEFAULT_TOKEN_KEY]
+                ).any()
+
+    @pytest.mark.parametrize(
+        "str2str,stack_method",
+        [
+            [True, None],
+            [False, "as_padded_tensor"],
+        ],
+    )
+    @pytest.mark.parametrize("batched", [True])
+    @pytest.mark.parametrize("device", [None])
+    @pytest.mark.parametrize("batch_size", [4])
+    @pytest.mark.parametrize("repeats", [3])
+    @pytest.mark.parametrize(
+        "assign_reward,assign_done", [[True, False], [True, True], [False, True]]
+    )
+    def test_done_and_reward(
+        self,
+        str2str,
+        batched,
+        stack_method,
+        device,
+        batch_size,
+        repeats,
+        assign_reward,
+        assign_done,
+    ):
+        with pytest.raises(
+            ValueError, match="str2str"
+        ) if str2str else contextlib.nullcontext():
+            if str2str:
+                kwargs = {
+                    "dataloader": self.DummyDataLoader(batch_size=batch_size),
+                    "data_keys": [LLMEnv._DEFAULT_STR_KEY],
+                    "example_data": "a string!",
+                    "repeats": repeats,
+                    "assign_reward": assign_reward,
+                    "assign_done": assign_done,
+                }
+            else:
+                if stack_method is None:
+                    stack_method = as_padded_tensor
+                kwargs = {
+                    "dataloader": self.DummyTensorDataLoader(
+                        padding=True, batch_size=batch_size
+                    ),
+                    "data_keys": [LLMEnv._DEFAULT_TOKEN_KEY],
+                    "data_specs": [Unbounded(shape=(-1,), dtype=torch.int64)],
+                    "stack_method": stack_method,
+                    "repeats": repeats,
+                    "assign_reward": assign_reward,
+                    "assign_done": assign_done,
+                }
+            kwargs.update(
+                {
+                    "str2str": str2str,
+                    "device": device,
+                    "has_attention": False,
+                    "no_stack": False,
+                }
+            )
+            env = LLMEnv.from_dataloader(**kwargs)
+            # We want to make sure that transforms that rely on the done state work appropriately
+            env.append_transform(StepCounter(max_steps=10))
+
+            def policy(td):
+                td[LLMEnv._DEFAULT_ACTION_KEY] = torch.ones(
+                    td.shape + (torch.randint(10, (1,)).item(),), dtype=torch.int64
+                )
+                return td
+
+            if batched:
+                r = env.rollout(
+                    100,
+                    policy,
+                    tensordict=TensorDict(batch_size=[3]),
+                    break_when_any_done=False,
+                )
+            else:
+                r = env.rollout(100, policy, break_when_any_done=False)
+            if assign_done:
+                assert "terminated" in r
+                assert "done" in r
+            print(r)
+
 
 if __name__ == "__main__":
     args, unknown = argparse.ArgumentParser().parse_known_args()