[Feature] ContentBase (#2985)

vmoens · web-flow · commit 840101dd8f0a · 2025-06-03T11:11:24.000+01:00
diff --git a/test/llm/test_data.py b/test/llm/test_data.py
@@ -7,12 +7,14 @@
 
 import argparse
 import importlib.util
+from typing import Mapping
 
 import pytest
 import torch
-from tensordict import set_list_to_stack
+from tensordict import lazy_stack, set_list_to_stack
 
 from torchrl.data import History
+from torchrl.data.llm.chat import ContentBase
 
 _has_transformers = importlib.util.find_spec("transformers")
 _has_vllm = importlib.util.find_spec("vllm")
@@ -216,6 +218,53 @@ def test_history_spec(self):
         assert spec.is_in(r)
         assert spec.is_in(history)
 
+    def test_content_base(self):
+        from transformers import AutoProcessor
+
+        processor = AutoProcessor.from_pretrained(
+            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
+        )
+
+        content_text = ContentBase(type="text", text="Hello, world!")
+        content_img = ContentBase(
+            type="image",
+            url="https://github.com/pytorch/rl/blob/main/docs/source/_static/img/icon.png?raw=true",
+        )
+        content = lazy_stack([content_text, content_img])
+        history0 = History(
+            role="assistant",
+            content=ContentBase(
+                type="text",
+                text="You are going to see an image and a hello world message. Ignore both.",
+                batch_size=1,
+            ),
+        )
+        history1 = History(role="user", content=content)
+        history = lazy_stack([history0, history1])
+        proc = history.apply_chat_template(
+            tokenizer=processor,
+            add_generation_prompt=False,
+            return_dict=True,
+            tokenize=False,
+        )
+        assert (
+            proc
+            == "<|im_start|>assistant \nYou are going to see an image and a hello world message. Ignore both.<|im_end|><|im_start|>user <image>\nHello, world!<|im_end|>"
+        )
+        proc = history.apply_chat_template(
+            tokenizer=processor,
+            add_generation_prompt=False,
+            return_dict=True,
+            tokenize=True,
+        )
+        assert isinstance(proc, Mapping)
+        assert proc["input_ids"].shape == (1, 7294)
+        assert proc["attention_mask"].shape == (1, 7294)
+        assert proc["pixel_values"].shape == (1, 37, 3, 384, 384), proc[
+            "pixel_values"
+        ].shape
+        assert (proc["image_sizes"] == torch.tensor([[2096, 2324]])).all()
+
 
 if __name__ == "__main__":
     args, unknown = argparse.ArgumentParser().parse_known_args()
diff --git a/torchrl/data/llm/chat.py b/torchrl/data/llm/chat.py
@@ -34,6 +34,98 @@
 }
 
 
+# The 'shadow' flag is needed so that tensordict does not complain about field names such as 'type' or 'size'
+class ContentBase(TensorClass["nocast", "shadow"]):
+    """Base class for all message content types.
+
+    Attributes:
+        type (str): The type of the content.
+        text (str, optional): The text content.
+        url (str, optional): The URL content.
+        data (str, optional): The data content.
+        mime_type (str, optional): The MIME type of the content.
+        name (str, optional): The name of the content.
+        size (int, optional): The size of the content.
+        function_name (str, optional): The name of the function.
+        function_args (dict, optional): The arguments of the function.
+
+    Examples:
+        >>> from tensordict import lazy_stack
+        >>> content1 = ContentBase(type="text", text="Hello, world!")
+        >>> print(content1)
+        ContentBase(
+            text=NonTensorData(data=Hello, world!, batch_size=torch.Size([]), device=None),
+            type=NonTensorData(data=text, batch_size=torch.Size([]), device=None),
+            url=None,
+            data=None,
+            mime_type=None,
+            name=None,
+            size=None,
+            function_name=None,
+            function_args=None,
+            batch_size=torch.Size([]),
+            device=None,
+            is_shared=False)
+        >>> content2 = ContentBase(type="image", url="https://example.com/image.jpg")
+        >>> print(content2)
+        ContentBase(
+            type=NonTensorData(data=image, batch_size=torch.Size([]), device=None),
+            url=NonTensorData(data=https://example.com/image.jpg, batch_size=torch.Size([]), device=None),
+            text=None,
+            data=None,
+            mime_type=None,
+            name=None,
+            size=None,
+            function_name=None,
+            function_args=None,
+            batch_size=torch.Size([]),
+            device=None,
+            is_shared=False)
+        >>> content = lazy_stack([content1, content2])
+        >>> print(content)
+        ContentBase(
+            type=NonTensorStack(
+                ['text', 'image'],
+                batch_size=torch.Size([2]),
+                device=None),
+            url=None,
+            data=None,
+            mime_type=None,
+            name=None,
+            size=None,
+            function_name=None,
+            function_args=None,
+            text=None,
+            batch_size=torch.Size([2]),
+            device=None,
+            is_shared=False)
+        >>> # A content is typically used in a History object. Usually, it has one more batch
+        >>> #  dimension than the History object.
+        >>> history = History(role="user", content=content)
+
+    """
+
+    type: Literal[
+        "text", "image", "audio", "video", "file", "function_call"
+    ]  # Required: "text", "image", "audio", "video", "file", "function_call"
+
+    # Text content
+    text: str | None = None
+
+    # Media/file content (either URL or data)
+    url: str | None = None  # HTTP URL to content
+    data: str | None = None  # Base64 encoded content
+
+    # Metadata
+    mime_type: str | None = None  # "image/jpeg", "audio/mp3", "application/pdf"
+    name: str | None = None  # Original filename or description
+    size: int | None = None  # File size in bytes
+
+    # Function calling (for AI agents)
+    function_name: str | None = None
+    function_args: dict | None = None
+
+
 class History(TensorClass["nocast"]):
     """A class representing a structured history of messages in a conversation, designed for efficient manipulation and integration with language models.
 
@@ -98,7 +190,7 @@ class History(TensorClass["nocast"]):
     """
 
     role: str
-    content: str
+    content: str | ContentBase
 
     def __post_init__(self):
         if not list_to_stack():
@@ -110,27 +202,29 @@ def __post_init__(self):
     def apply_chat_template(
         self,
         *,
-        tokenizer: transformers.AutoTokenizer,  # noqa
+        tokenizer: transformers.AutoTokenizer | transformers.AutoProcessor,  # noqa
         add_generation_prompt: bool = True,
         chat_template: str | None = None,
         continue_final_message: bool = False,
         tokenize: bool = False,
         padding: bool | str = False,
         truncation: bool | str = False,
         return_tensors: str | None = "pt",
+        return_dict: bool = False,
         **kwargs,
     ):
         """Applies a chat template to the history.
 
         Keyword Args:
-            tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use.
-            add_generation_prompt (bool, optional): Whether to add a generation prompt. Defaults to True.
+            tokenizer (transformers.PreTrainedTokenizer | transformers.AutoProcessor): The tokenizer or processor to use.
+            add_generation_prompt (bool, optional): Whether to add a generation prompt. Defaults to `True`.
             chat_template (str, optional): The chat template to use. Defaults to the tokenizer's default template.
-            continue_final_message (bool, optional): Whether to continue the final message. Defaults to False.
-            tokenize (bool, optional): Whether to tokenize the output. Defaults to False.
-            padding (bool | str, optional): The padding strategy to use. Defaults to False.
-            truncation (bool | str, optional): The truncation strategy to use. Defaults to False.
+            continue_final_message (bool, optional): Whether to continue the final message. Defaults to `False`.
+            tokenize (bool, optional): Whether to tokenize the output. Defaults to `False`.
+            padding (bool | str, optional): The padding strategy to use. Defaults to `False`.
+            truncation (bool | str, optional): The truncation strategy to use. Defaults to `False`.
             return_tensors (str | None, optional): The type of tensors to return. Defaults to "pt".
+            return_dict (bool, optional): Whether to return a dictionary. Defaults to `False`.
             **kwargs: Additional keyword arguments to pass to the tokenizer `apply_chat_template` method.
 
         Returns:
@@ -155,20 +249,24 @@ def apply_chat_template(
                     truncation=truncation,
                     return_tensors=return_tensors,
                     continue_final_message=continue_final_message,
+                    return_dict=return_dict,
                     **kwargs,
                 )
                 for i in range(self.batch_size[0])
             ]
-        self_flat = self.view(-1).tolist()
+        self_flat = self.view(-1)
+        # tolist_first=True is needed to obtain a list of dicts of lists of dicts rather than a list of dicts of dicts
+        self_flat = self_flat.tolist(tolist_first=True)
         return tokenizer.apply_chat_template(
-            self_flat,
+            conversation=self_flat,
             add_generation_prompt=add_generation_prompt,
             chat_template=chat_template,
             tokenize=tokenize,
             padding=padding,
             truncation=truncation,
             return_tensors=return_tensors,
             continue_final_message=continue_final_message,
+            return_dict=return_dict,
         )
 
     @classmethod
@@ -275,7 +373,7 @@ def append(
 
         Args:
             history (History): The new history to append.
-            inplace (bool, optional): Whether to perform the operation in-place. Defaults to True.
+            inplace (bool, optional): Whether to perform the operation in-place. Defaults to `True`.
             dim (int, optional): The dimension to append along. Defaults to -1.
 
         Returns:
diff --git a/torchrl/envs/custom/llm.py b/torchrl/envs/custom/llm.py
@@ -43,7 +43,7 @@ class LLMHashingEnv(EnvBase):
         observation_key (NestedKey, optional): The key for the observation in the TensorDict.
             Defaults to "observation".
         text_output (bool, optional): Whether to include the text output in the observation.
-            Defaults to True.
+            Defaults to `True`.
         tokenizer (transformers.Tokenizer | None, optional):
             A tokenizer function that converts text to tensors.
             Only used when `text_output` is `True`.
diff --git a/torchrl/envs/llm/envs.py b/torchrl/envs/llm/envs.py
@@ -602,7 +602,7 @@ class LLMHashingEnv(EnvBase):
         observation_key (NestedKey, optional): The key for the observation in the TensorDict.
             Defaults to "observation".
         text_output (bool, optional): Whether to include the text output in the observation.
-            Defaults to True.
+            Defaults to `True`.
         tokenizer (transformers.Tokenizer | None, optional):
             A tokenizer function that converts text to tensors.
             Only used when `text_output` is `True`.
diff --git a/torchrl/envs/llm/transforms/dataloading.py b/torchrl/envs/llm/transforms/dataloading.py
@@ -234,7 +234,7 @@ class DataLoadingPrimer(TensorDictPrimer):
         ...         Args:
         ...             batch_size (int, optional): The batch size of the generated tensors. Defaults to 0.
         ...             max_length (int, optional): The maximum length of the generated tensors. Defaults to 10.
-        ...             padding (bool, optional): Whether to pad the tensors to the maximum length. Defaults to False.
+        ...             padding (bool, optional): Whether to pad the tensors to the maximum length. Defaults to `False`.
         ...         '''
         ...         self.batch_size = batch_size
         ...         self.max_length = max_length
diff --git a/torchrl/envs/transforms/r3m.py b/torchrl/envs/transforms/r3m.py
@@ -214,7 +214,7 @@ class R3MTransform(Compose):
             If the torchvision weights are needed, there are two ways they can be
             obtained: :obj:`download=ResNet50_Weights.IMAGENET1K_V1` or :obj:`download="IMAGENET1K_V1"`
             where :obj:`ResNet50_Weights` can be imported via :obj:`from torchvision.models import resnet50, ResNet50_Weights`.
-            Defaults to False.
+            Defaults to `False`.
         download_path (str, optional): path where to download the models.
             Default is None (cache path determined by torch.hub utils).
         tensor_pixels_keys (list of str, optional): Optionally, one can keep the
diff --git a/torchrl/envs/transforms/transforms.py b/torchrl/envs/transforms/transforms.py
@@ -5835,7 +5835,7 @@ class DiscreteActionProjection(Transform):
         action_key (NestedKey, optional): key name of the action. Defaults to "action".
         include_forward (bool, optional): if ``True``, a call to forward will also
             map the action from one domain to the other when the module is called
-            by a replay buffer or an nn.Module chain. Defaults to True.
+            by a replay buffer or an nn.Module chain. Defaults to `True`.
 
     Examples:
         >>> torch.manual_seed(0)
diff --git a/torchrl/envs/transforms/vip.py b/torchrl/envs/transforms/vip.py
@@ -186,7 +186,7 @@ class VIPTransform(Compose):
             If the torchvision weights are needed, there are two ways they can be
             obtained: :obj:`download=ResNet50_Weights.IMAGENET1K_V1` or :obj:`download="IMAGENET1K_V1"`
             where :obj:`ResNet50_Weights` can be imported via :obj:`from torchvision.models import resnet50, ResNet50_Weights`.
-            Defaults to False.
+            Defaults to `False`.
         download_path (str, optional): path where to download the models.
             Default is None (cache path determined by torch.hub utils).
         tensor_pixels_keys (list of str, optional): Optionally, one can keep the
diff --git a/torchrl/envs/utils.py b/torchrl/envs/utils.py
@@ -704,7 +704,7 @@ def check_env_specs(
             return_contiguous=True. This will fail in some cases (e.g. heterogeneous shapes
             of inputs/outputs). Defaults to ``None`` (determined by the presence of dynamic specs).
         check_dtype (bool, optional): if False, dtype checks will be skipped.
-            Defaults to True.
+            Defaults to `True`.
         seed (int, optional): for reproducibility, a seed can be set.
             The seed will be set in pytorch temporarily, then the RNG state will
             be reverted to what it was before. For the env, we set the seed but since
diff --git a/torchrl/trainers/helpers/models.py b/torchrl/trainers/helpers/models.py
@@ -216,7 +216,7 @@ def make_dreamer(
         value_key (str, optional): Key to use for the value.
             Defaults to "state_value".
         use_decoder_in_env (bool, optional): Whether to use the decoder in the model based dreamer env.
-            Defaults to False.
+            Defaults to `False`.
         obs_norm_state_dict (dict, optional): the state_dict of the ObservationNorm transform used
             when proof_environment is missing. Defaults to None.