
Commit 48111e9

Author: Vincent Moens
[Feature] Allow vLLMWrapper to generate multiple samples
ghstack-source-id: af58943 Pull Request resolved: #2878
1 parent afc5d59 commit 48111e9

File tree: 2 files changed, +86 −41 lines


test/test_actors.py

Lines changed: 12 additions & 0 deletions
@@ -1374,6 +1374,18 @@ def _run_check_collector(self, policy):
         assert "tokens" in data
         # assert ("next", "tokens") in data
 
+    def test_generate_multiple_trajs_vllm(self, vllm_instance):
+        policy = vLLMWrapper(
+            vllm_instance,
+            return_log_probs=True,
+            generate_kwargs={"n": 10, "max_tokens": 1024},
+            inplace=False,
+        )
+        data = TensorDict(
+            text=NonTensorStack("a string", "another very long string"), batch_size=2
+        )
+        data = policy(data)
+
 
 if __name__ == "__main__":
     args, unknown = argparse.ArgumentParser().parse_known_args()
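For context, here is a sketch of how the new multi-sample path is meant to be exercised, following the test above. The model name, the import path of the wrapper and the final batch shape are assumptions for illustration, not guarantees of the API.

```python
from tensordict import TensorDict
from tensordict.tensorclass import NonTensorStack
from vllm import LLM

from torchrl.modules.llm import vLLMWrapper  # import path assumed

model = LLM("facebook/opt-125m")  # any small vLLM model, chosen for illustration

policy = vLLMWrapper(
    model,
    return_log_probs=True,
    generate_kwargs={"n": 10, "max_tokens": 1024},  # 10 completions per prompt
    inplace=False,  # required (or left as None) when n > 1
)
data = TensorDict(
    text=NonTensorStack("a string", "another very long string"), batch_size=2
)
out = policy(data)
# The prompt entries are broadcast along a trailing sample dimension, so `out`
# is expected to have batch_size [2, 10] (assumption based on the expand logic below).
print(out.batch_size)
```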

torchrl/modules/llm/vllm_wrapper.py

Lines changed: 74 additions & 41 deletions
@@ -10,10 +10,10 @@
 import torch
 from tensordict import (
     lazy_stack,
-    LazyStackedTensorDict,
+    maybe_dense_stack,
     NestedKey,
     TensorDict,
-    TensorDictBase, maybe_dense_stack,
+    TensorDictBase,
 )
 from tensordict.tensorclass import from_dataclass, NonTensorStack, TensorClass
 from tensordict.utils import _zip_strict, expand_as_right
@@ -61,7 +61,8 @@ class vLLMWrapper(CategoricalSequential):
         inplace (Literal[True, False, "empty"] | None, optional): Determines how the module should handle in-place
             operations. If `True`, operations will be performed in-place. If `False`, a new TensorDict instance will be
             created. If `"empty"`, the output data structure will be initialized with `input.empty()` (i.e., it will
-            conserve type, batch-size, and device). Defaults to `True`.
+            conserve type, batch-size, and device). Defaults to `True` when generating a single sample, `False`
+            otherwise.
 
     .. note:: The tokenizer is used when `from_text` is `True` to convert input text into token sequences. It is also
         required (or retrieved) when `pad_output` is `True` or when using text inputs with `generate=False` to ensure proper
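A brief sketch of the three documented `inplace` modes, paraphrasing the docstring change above; the behaviours noted in the comments are drawn from this diff, not a verbatim API description, and the model name is a placeholder.

```python
from vllm import LLM
from torchrl.modules.llm import vLLMWrapper  # import path assumed

model = LLM("facebook/opt-125m")  # placeholder model

w_inplace = vLLMWrapper(model, inplace=True)     # write outputs back into the input TensorDict
w_new = vLLMWrapper(model, inplace=False)        # return a freshly built output TensorDict
w_empty = vLLMWrapper(model, inplace="empty")    # start from input.empty(): same type, batch-size, device

# After this commit, leaving inplace=None picks True for single-sample generation
# and False when generate_kwargs["n"] > 1 (see the resolution logic added below).
```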
@@ -125,7 +126,7 @@ def __init__(
         generate_kwargs: dict | None = None,
         tokenizer_kwargs: dict | None = None,
         pad_output: bool = False,
-        inplace: Literal[True, False, "empty"] | None = True,
+        inplace: Literal[True, False, "empty"] | None = None,
     ):
         super().__init__()
 
@@ -135,7 +136,6 @@ def __init__(
         self.from_text = from_text
         self._device = device
         self.generate = generate
-        self.inplace = inplace
         self.pad_output = pad_output
         padding_value = None
 
@@ -180,6 +180,18 @@ def __init__(
         else:
             generate_kwargs = dict(generate_kwargs)
 
+        if generate_kwargs.get("n", 1) > 1:
+            if inplace in (True, "empty"):
+                raise ValueError(
+                    "inplace must be False (or None) when generating more than one sample."
+                )
+            if inplace is None:
+                inplace = False
+        elif inplace is None:
+            inplace = True
+
+        self.inplace = inplace
+
         prompt_logprobs = False
 
@@ -225,45 +237,39 @@ def forward(
         if tensordict.device:
             tensordict = tensordict.copy().clear_device_()
 
-        out = LazyStackedTensorDict(
-            *[
-                TensorDict(
-                    device=tensordict.device, batch_size=tensordict.batch_size[1:]
-                )
-                for _ in range(tensordict.shape[0])
-            ]
-        )
         if self.from_text:
             if self.generate:
-                out = self._from_vllm_generate_text(tensordict, out=out)
+                out = self._from_vllm_generate_text(tensordict)
             else:
-                out = self._from_vllm_logprobs_text(tensordict, out=out)
+                out = self._from_vllm_logprobs_text(tensordict)
         else:
             if self.generate:
-                out = self._from_vllm_generate_tokens(tensordict, out=out)
+                out = self._from_vllm_generate_tokens(tensordict)
             else:
-                out = self._from_vllm_logprobs_tokens(tensordict, out=out)
+                out = self._from_vllm_logprobs_tokens(tensordict)
         if _source_device:
             out = out.to(_source_device)
 
         if tensordict_out is None:
             if self.inplace is True:
                 tensordict_out = tensordict
             elif self.inplace is False:
-                tensordict_out = TensorDict()
+                tensordict_out = out
             elif self.inplace == "empty":
                 tensordict_out = tensordict.empty()
 
-        if tensordict_out is not None:
+        if tensordict_out is not None and tensordict_out is not out:
             result = tensordict_out
             result.update(out, keys_to_update=self.out_keys)
-        else:
+        elif tensordict_out is not out:
             result = out
             keys = list(set(self.out_keys + list(tensordict.keys(True, True))))
             return tensordict.update(result, keys_to_update=keys)
+        else:
+            result = out
         return result
 
-    def _from_vllm_generate_text(self, td, out):
+    def _from_vllm_generate_text(self, td):
         kwargs = {"sampling_params": self.sampling_params}
         args = ()
         input_ids = None
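For intuition on why in-place writing is ruled out with several samples: the helpers now broadcast the prompt-side entries to the output shape, which gains a trailing sample dimension and no longer matches the input batch-size. A minimal illustration with plain tensordict calls (the shapes are assumptions for the example):

```python
import torch
from tensordict import TensorDict

prompts = TensorDict({"tokens": torch.zeros(2, 5, dtype=torch.long)}, batch_size=[2])

# With generate_kwargs={"n": 10}, the helpers expand the inputs to the output shape:
expanded = prompts.unsqueeze(-1).expand(2, 10)
print(expanded.batch_size)  # torch.Size([2, 10])

# The result can no longer be written back into `prompts` (batch_size [2]),
# which is why forward() now returns `out` directly when inplace is False.
```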
@@ -301,16 +307,22 @@ def _from_vllm_generate_text(self, td, out):
             self.token_response_key,
             self.text_response_key,
             self.token_key,
+            self.attention_mask_key,
         ]
-        out.update(tokens_out, keys_to_update=in_keys)
+        out = tokens_out.select(*in_keys, strict=False)
         # We might already have the tokens
-        if input_ids is not None:
+        if input_ids is not None and self.token_key not in out:
             out[self.token_key] = input_ids
-        if attention_mask is not None:
+        if attention_mask is not None and self.attention_mask_key not in out:
             out[self.attention_mask_key] = attention_mask
+        inputs = td.select(*self.in_keys, strict=False)
+        if inputs.ndim < out.ndim:
+            # This happens when n > 1
+            inputs = inputs.unsqueeze(-1).expand(out.shape)
+        out.update(inputs)
         return out
 
-    def _from_vllm_logprobs_text(self, td, out):
+    def _from_vllm_logprobs_text(self, td):
         text_prompt = td.get(self.text_key)
         if not isinstance(text_prompt, list):
             text_prompt = text_prompt.tolist()
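The generate-from-text path above now relies on `select(..., strict=False)` to pull whatever prompt keys are present without raising on the missing ones (for instance an attention mask when the prompt came in as raw text). A small tensordict-only illustration:

```python
import torch
from tensordict import TensorDict

td = TensorDict({"tokens": torch.zeros(2, 3, dtype=torch.long)}, batch_size=[2])

# strict=False keeps the keys that exist and silently skips the absent ones,
# so adding attention_mask_key to in_keys is safe for text-only prompts.
sub = td.select("tokens", "attention_mask", strict=False)
print(list(sub.keys()))  # ['tokens']
```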
@@ -358,7 +370,7 @@ def _from_vllm_logprobs_text(self, td, out):
         tokens_out = _RequestOutput_tc.from_request_output(tokens_out)
         tokens_out = tokens_out.select(
             "prompt_token_ids", "prompt_logprobs", strict=False
-        )
+        )._tensordict
 
         # we disregard the tokens from the prompt to focus on those of the response
         if self.pad_output:
@@ -378,13 +390,19 @@ def _from_vllm_logprobs_text(self, td, out):
                 [lp[..., -len(tr) :] for lp, tr in zip(lps, input_ids_response)]
             )
 
+        out = tokens_out.empty(recurse=True)
         if isinstance(input_ids_response, list):
             input_ids_response = torch.nested.nested_tensor(input_ids_response)
         out["tokens_response"] = input_ids_response
         out["log_probs"] = lps
+        inputs = td.select(*self.in_keys, strict=False)
+        if inputs.ndim < out.ndim:
+            # This happens when n > 1
+            inputs = inputs.unsqueeze(-1).expand(out.shape)
+        out.update(inputs)
         return out
 
-    def _from_vllm_generate_tokens(self, td, out):
+    def _from_vllm_generate_tokens(self, td):
         input_ids = td.get(self.token_key)
         attention_mask = td.get(self.attention_mask_key)
         input_ids_list = self._to_list(input_ids, attention_mask)
@@ -414,12 +432,18 @@ def _from_vllm_generate_tokens(self, td, out):
             lps = tokens_response_td["log_probs"]
             lps = torch.where(expand_as_right(~padded_values, lps), lps, 0.0)
             tokens_response_td["log_probs"] = lps
+        out = tokens_response_td.empty(recurse=True)
         out.update(
             tokens_response_td, keys_to_update=(self.token_response_key, "log_probs")
         )
+        inputs = td.select(*self.in_keys, strict=False)
+        if inputs.ndim < out.ndim:
+            # This happens when n > 1
+            inputs = inputs.unsqueeze(-1).expand(out.shape)
+        out.update(inputs)
         return out
 
-    def _from_vllm_logprobs_tokens(self, td, out):
+    def _from_vllm_logprobs_tokens(self, td):
 
         tokens = td.get(self.token_key)
         tokens_response = td.get(self.token_response_key)
@@ -442,8 +466,14 @@ def _from_vllm_logprobs_tokens(self, td, out):
         prompt_logprobs = prompt_logprobs[..., -tokens_response.shape[-1] :]
         padded = tokens_response == self.padding_value
         prompt_logprobs = torch.where(~padded, prompt_logprobs, 0.0)
+        out = tokens_out._tensordict.empty(recurse=True)
         out.set("log_probs", prompt_logprobs)
         out.set(self.token_response_key, tokens_response)
+        inputs = td.select(*self.in_keys, strict=False)
+        if inputs.ndim < out.ndim:
+            # This happens when n > 1
+            inputs = inputs.unsqueeze(-1).expand(out.shape)
+        out.update(inputs)
         return out
 
     def _get_output_tokens_and_log_probs(self, tokens_out):
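Each helper above now allocates its own output container with `empty(recurse=True)` instead of receiving a pre-built lazy stack from `forward()`. That call returns a tensordict with the same batch-size, device and nesting but no leaves, which is then filled with the selected keys. A small illustration (shapes are arbitrary):

```python
import torch
from tensordict import TensorDict

src = TensorDict(
    {
        "tokens_response": torch.zeros(2, 10, 4, dtype=torch.long),
        "log_probs": torch.zeros(2, 10, 4),
    },
    batch_size=[2, 10],
)
out = src.empty(recurse=True)
print(out.batch_size)              # torch.Size([2, 10])
print(list(out.keys(True, True)))  # [] -- same shape, no leaves yet
out.set("log_probs", src["log_probs"])
```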
@@ -463,19 +493,21 @@ def _get_output_tokens_and_log_probs(self, tokens_out):
         if not self.pad_output:
             # Then we can safely move the input tokens, but otherwise they
             # may need padding
-            tokens_response_td.update(
-                tokens_out.select("prompt_token_ids")
-            ).rename_key_("prompt_token_ids", self.token_key)
+            tokens_out = tokens_out.select("prompt_token_ids")
+            if tokens_out.ndim < tokens_response_td.ndim:
+                tokens_out = tokens_out.unsqueeze(1).expand(tokens_response_td.shape)
+            tokens_response_td.update(tokens_out).rename_key_(
+                "prompt_token_ids", self.token_key
+            )
 
-        if self.return_log_probs:
+        if self.return_log_probs or "logprobs" in tokens_response_td:
             tokens_response_td.rename_key_("logprobs", "log_probs")
             if self.pad_output:
                 padded_values = tokens_response_td["tokens_response"] == padding_value
                 if padded_values.any():
                     lps = tokens_response_td["log_probs"]
                     lps = torch.where(expand_as_right(~padded_values, lps), lps, 0.0)
                     tokens_response_td["log_probs"] = lps
-
         return tokens_response_td
 
     def _to_list(self, tokens, attention_mask):
@@ -553,14 +585,15 @@ def get_logprob(output):
             self.outputs = outputs[0]
         else:
             self.outputs = maybe_dense_stack(outputs)
-        self.prompt_logprobs = torch.tensor(
-            [
-                v[tid].logprob if v is not None else 0.0
-                for v, tid in _zip_strict(
-                    self.prompt_logprobs, self.prompt_token_ids
-                )
-            ]
-        )
+        if self.prompt_logprobs is not None:
+            self.prompt_logprobs = torch.tensor(
+                [
+                    v[tid].logprob if v is not None else 0.0
+                    for v, tid in _zip_strict(
+                        self.prompt_logprobs, self.prompt_token_ids
+                    )
+                ]
+            )
         self.prompt_token_ids = torch.tensor(self.prompt_token_ids)
         self.num_cached_tokens = torch.tensor(self.num_cached_tokens)
 
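The last hunk guards the prompt log-probability conversion: during plain generation vLLM leaves `prompt_logprobs` as `None`, so the tensor conversion only makes sense when prompt log-probabilities were actually requested. A hedged standalone sketch of the same guard (the function name is illustrative, not the module's API):

```python
import torch


def prompt_logprobs_to_tensor(prompt_logprobs, prompt_token_ids):
    """Convert per-token prompt logprobs to a tensor, skipping the generation-only case."""
    if prompt_logprobs is None:
        # generation-only requests carry no prompt logprobs
        return None
    return torch.tensor(
        [
            lp[tid].logprob if lp is not None else 0.0
            for lp, tid in zip(prompt_logprobs, prompt_token_ids)
        ]
    )
```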