[Feature] History API

Vincent Moens · Vincent Moens · commit fd10fe213efb · 2025-04-04T09:57:21.000+01:00
ghstack-source-id: 5b9723f Pull Request resolved: #2890
diff --git a/docs/source/reference/data.rst b/docs/source/reference/data.rst
@@ -1107,10 +1107,10 @@ and the tree can be expanded for each of these. The following figure shows how t
     Tree
 
 
-Reinforcement Learning From Human Feedback (RLHF)
--------------------------------------------------
+Large language models and Reinforcement Learning From Human Feedback (RLHF)
+---------------------------------------------------------------------------
 
-Data is of utmost importance in Reinforcement Learning from Human Feedback (RLHF).
+Data is of utmost importance in LLM post-training, e.g., with GRPO or Reinforcement Learning from Human Feedback (RLHF).
 Given that these techniques are commonly employed in the realm of language,
 which is scarcely addressed in other subdomains of RL within the library,
 we offer specific utilities to facilitate interaction with external libraries
@@ -1124,6 +1124,7 @@ efficient sampling.
     :toctree: generated/
     :template: rl_template.rst
 
+    History
     PairwiseDataset
     PromptData
     PromptTensorDictTokenizer
diff --git a/test/test_rb.py b/test/test_rb.py
@@ -42,6 +42,7 @@
     is_tensor_collection,
     is_tensorclass,
     LazyStackedTensorDict,
+    set_list_to_stack,
     tensorclass,
     TensorDict,
     TensorDictBase,
@@ -54,6 +55,7 @@
 from torchrl.collectors.utils import split_trajectories
 from torchrl.data import (
     FlatStorageCheckpointer,
+    History,
     MultiStep,
     NestedStorageCheckpointer,
     PrioritizedReplayBuffer,
@@ -127,6 +129,7 @@
 _has_gym = importlib.util.find_spec("gym") is not None
 _has_snapshot = importlib.util.find_spec("torchsnapshot") is not None
 _os_is_windows = sys.platform == "win32"
+_has_transformers = importlib.util.find_spec("transformers") is not None
 TORCH_VERSION = version.parse(version.parse(torch.__version__).base_version)
 
 torch_2_3 = version.parse(
@@ -3916,6 +3919,185 @@ def test_multi_env(self, storage_type, checkpointer, tmpdir, frames_per_batch):
             assert rb._writer._cursor == rb_test._writer._cursor
 
 
+class TestHistory:
+    @pytest.fixture(scope="class", autouse=True)
+    def set_context(self):
+        with set_list_to_stack(True):
+            yield
+
+    def test_history_construct(self):
+        hst0 = History(role="user", content="a message")
+        assert not hst0.shape
+        hst1 = History(role="user", content="another message")
+        with pytest.raises(RuntimeError, match="unsqueeze"):
+            hst0.append(hst1)
+        hst0 = hst0.unsqueeze(0)
+
+        # In an env.step, we typically have one more piece of history to add to the stack
+        assert not hst1.shape
+        assert not hst1.batch_size
+        assert not hst1.batch_dims
+        # test out-of-place
+        hst0_copy = hst0.copy()
+        hst0b = hst0.append(hst1, inplace=False)
+        assert hst0b is not hst0
+        assert (hst0 == hst0_copy).all()
+        assert (hst0b[:-1] == hst0).all()
+
+        # test in-place
+        hst0b = hst0.append(hst1)
+        assert hst0b is hst0
+        assert hst0b.shape == (2,)
+
+        assert hst0b.content == ["a message", "another message"]
+        hst2 = History(
+            role=["assistant", "user"],
+            content=["i'm the assistant", "i'm the user"],
+            batch_size=2,
+        )
+        assert hst2[0].role == "assistant"
+        assert hst2[0].content == "i'm the assistant"
+        assert hst2[1].role == "user"
+        assert hst2[1].content == "i'm the user"
+        with pytest.raises(RuntimeError, match="The new history to extend"):
+            hst0.extend(hst1)
+
+        # test out-of-place
+        hst0_copy = hst0.copy()
+        hst0b = hst0.extend(hst2, inplace=False)
+        assert hst0b is not hst0
+        assert (hst0 == hst0_copy).all()
+        assert (hst0b[:-2] == hst0).all()
+
+        # test in-place
+        hst0b = hst0.extend(hst2)
+
+        assert hst0b is hst0
+        assert hst0.__dict__["_tensordict"].shape == (4,)
+        assert hst0.shape == (4,)
+        assert hst0.role == ["user", "user", "assistant", "user"]
+        assert hst0.content == [
+            "a message",
+            "another message",
+            "i'm the assistant",
+            "i'm the user",
+        ]
+
+    def test_history_construct_ndim(self):
+        hst0 = History(role="user", content="a message").unsqueeze(0).unsqueeze(0)
+        hst1 = History(role="user", content="another message").unsqueeze(0)
+
+        # test out-of-place
+        hst0_copy = hst0.copy()
+        hst0b = hst0.append(hst1, inplace=False, dim=1)
+        assert hst0b is not hst0
+        assert (hst0 == hst0_copy).all()
+        assert (hst0b[:, :-1] == hst0).all()
+
+        # test in-place
+        hst0b = hst0.append(hst1, dim=1)
+        assert hst0b is hst0
+        assert hst0b.shape == (
+            1,
+            2,
+        )
+
+        assert hst0b.content == [["a message", "another message"]]
+        hst2 = History(
+            role=["assistant", "user"],
+            content=["i'm the assistant", "i'm the user"],
+            batch_size=2,
+        ).unsqueeze(0)
+
+        # test out-of-place
+        hst0_copy = hst0.copy()
+        hst0b = hst0.extend(hst2, inplace=False, dim=1)
+        assert hst0b is not hst0
+        assert (hst0 == hst0_copy).all()
+        assert (hst0b[:, :-2] == hst0).all()
+
+        # test in-place
+        hst0b = hst0.extend(hst2, dim=1)
+
+        assert hst0b is hst0
+        assert hst0.__dict__["_tensordict"].shape == (
+            1,
+            4,
+        )
+        assert hst0.shape == (
+            1,
+            4,
+        )
+        assert hst0.role == [["user", "user", "assistant", "user"]]
+        assert hst0.content == [
+            [
+                "a message",
+                "another message",
+                "i'm the assistant",
+                "i'm the user",
+            ]
+        ]
+
+    @pytest.fixture(scope="class")
+    def mock_history(self):
+        history0 = History(
+            role="system",
+            content="""CONTENT
+        This is the setup""",
+        )
+        history1 = History(
+            role="user",
+            content="""CONTENT
+        This is the first user prompt""",
+        )
+        history2 = History(
+            role="assistant",
+            content="""CONTENT
+        This is the second prompt, the first for the assistant.""",
+        )
+        history = torch.stack([history0, history1, history2])
+        return history
+
+    @pytest.fixture(scope="class")
+    def tokenizer(self):
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained("GPT2")
+        yield tokenizer
+
+    @pytest.mark.skipif(not _has_transformers, reason="requires transformers library")
+    def test_history_template(self, mock_history, tokenizer):
+        history = mock_history
+        data_str = history.apply_chat_template(
+            tokenizer=tokenizer, add_generation_prompt=False
+        )
+        assert isinstance(data_str, str)
+        data_token = history.apply_chat_template(
+            tokenizer=tokenizer, tokenize=True, add_generation_prompt=False
+        )
+        assert isinstance(data_token, torch.Tensor)
+
+        # test add_generation_prompt
+        data_str = history.apply_chat_template(
+            tokenizer=tokenizer, add_generation_prompt=True
+        )
+        assert isinstance(data_str, str)
+        assert data_str.endswith("<|im_start|>assistant\n"), data_str
+
+    @pytest.mark.skipif(not _has_transformers, reason="requires transformers library")
+    def test_history_template_recover(self, mock_history, tokenizer):
+        history = mock_history
+        data_str = history.apply_chat_template(tokenizer=tokenizer)
+        # Test inverse
+        recovered = history._inv_chatml(data_str)
+        assert recovered.role == history.role
+        assert recovered.content == history.content
+        data_token = history.apply_chat_template(
+            tokenizer=tokenizer, tokenize=True, add_generation_prompt=False
+        )
+        recovered = history._inv_chatml(tokenizer.batch_decode(data_token)[0])
+
+
 if __name__ == "__main__":
     args, unknown = argparse.ArgumentParser().parse_known_args()
     pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown)
diff --git a/torchrl/data/__init__.py b/torchrl/data/__init__.py
@@ -8,6 +8,7 @@
     ConstantKLController,
     create_infinite_iterator,
     get_dataloader,
+    History,
     LLMData,
     LLMInput,
     LLMOutput,
@@ -108,6 +109,7 @@
 
 __all__ = [
     "AdaptiveKLController",
+    "History",
     "Binary",
     "BinaryDiscreteTensorSpec",
     "BinaryToDecimal",
diff --git a/torchrl/data/llm/__init__.py b/torchrl/data/llm/__init__.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+from .chat import History
 from .dataset import (
     create_infinite_iterator,
     get_dataloader,
@@ -35,4 +36,5 @@
     "TokenizedDatasetLoader",
     "create_infinite_iterator",
     "get_dataloader",
+    "History",
 ]
diff --git a/torchrl/data/llm/chat.py b/torchrl/data/llm/chat.py
diff --git a/torchrl/envs/transforms/llm.py b/torchrl/envs/transforms/llm.py