Commit 19f3bd5

[BugFix] Wrappers stack fn (#3061)
1 parent: be7156f

4 files changed: +10 −8 lines

test/llm/test_wrapper.py

Lines changed: 6 additions & 4 deletions
@@ -1192,6 +1192,8 @@ def test_retrieve_kl_input_modes(
             ref_model=ref_model,
             assistant_only=assistant_only,
             tokenizer=tokenizer,
+            gen_log_probs_full_key=("gen_log_probs", "full"),
+            ref_log_probs_full_key=("ref_log_probs", "full"),
         )
 
         # Apply transform
@@ -1202,15 +1204,15 @@ def test_retrieve_kl_input_modes(
         # Check that both log-probs and KL are present
         assert ("gen_log_probs", "full") in result
         assert ("ref_log_probs", "full") in result
-        assert "kl" in result
+        assert "kl_penalty" in result
 
         # Check KL structure
         if pad_output:
-            kl = result.get("kl")
+            kl = result.get("kl_penalty")
             assert isinstance(kl, torch.Tensor)
             assert kl.shape[0] == 2  # batch size
         else:
-            kl = result.get("kl", as_list=True)
+            kl = result.get("kl_penalty", as_list=True)
             # For unpadded output, we get a list of tensors
             assert isinstance(kl, list)
             assert len(kl) == 2  # batch size
@@ -1391,7 +1393,7 @@ def test_kl_computation_transform(
 
         # Check that reward is modified
         assert "reward" in result
-        reward = result.get("reward")
+        reward = result.get("reward", as_list=True)
         assert reward is not None
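The test now reads the renamed output key back in two shapes: a dense tensor when outputs are padded, a list when they are ragged. A minimal sketch of the unpadded access pattern, assuming a recent tensordict with `lazy_stack` and the `as_list` getter the test relies on (the tensors here are stand-ins, not the transform's real output):

```python
import torch
from tensordict import TensorDict, lazy_stack

# Stand-in for the transform's output: a lazily stacked batch whose
# "kl_penalty" entries have different lengths per sample (ragged).
result = lazy_stack(
    [
        TensorDict({"kl_penalty": torch.randn(5)}, batch_size=[]),
        TensorDict({"kl_penalty": torch.randn(7)}, batch_size=[]),
    ]
)

# Heterogeneous shapes cannot be fetched as one dense tensor, so the
# updated test reads them back as a list of tensors instead.
kl = result.get("kl_penalty", as_list=True)
assert isinstance(kl, list) and len(kl) == 2
```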
torchrl/envs/llm/transforms/kl.py

Lines changed: 2 additions & 2 deletions
@@ -680,8 +680,8 @@ class RetrieveKL(Compose):
             For other input modes (`"text"` or `"tokens"`), set `assistant_only=False`.
             This ensures users are conscious of the limitation that assistant token identification requires structured conversation history.
 
-        gen_log_prob_full_key (str): the key where the log-probs of the generation model are stored. Defaults to `("log_probs", "full")`.
-        ref_log_prob_full_key (str): the key where the log-probs of the reference model are stored. Defaults to `("ref_log_probs", "full")`.
+        gen_log_probs_full_key (str): the key where the log-probs of the generation model are stored. Defaults to `("log_probs", "full")`.
+        ref_log_probs_full_key (str): the key where the log-probs of the reference model are stored. Defaults to `("ref_log_probs", "full")`.
         history_key (str): the key where the history is stored. Defaults to `"history"`.
         tokenizer_kwargs (dict): the keyword arguments to pass to the tokenizer to be used to apply the chat template to the history when `assistant_only` is `True`.
             To control the tokenization in the actor, pass the tokenizer kwargs to the actor constructor.
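For reference, a construction sketch using the corrected keyword names; the argument values mirror the updated test, and `ref_model`/`tokenizer` are assumed to be built elsewhere (anything not shown keeps its default):

```python
from torchrl.envs.llm.transforms.kl import RetrieveKL

def build_retrieve_kl(ref_model, tokenizer):
    # ref_model: a reference policy wrapper; tokenizer: a HF tokenizer.
    # Remaining constructor arguments keep their defaults, as in the
    # (partially shown) test call.
    return RetrieveKL(
        ref_model=ref_model,
        assistant_only=True,
        tokenizer=tokenizer,
        gen_log_probs_full_key=("gen_log_probs", "full"),
        ref_log_probs_full_key=("ref_log_probs", "full"),
    )
```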

torchrl/modules/llm/policies/transformers_wrapper.py

Lines changed: 1 addition & 1 deletion
@@ -686,7 +686,7 @@ def _from_transformers_generate_history(self, td, cfg, out) -> TensorDictBase:
         h_responses = _extract_responses_from_full_histories(
             text_full, prompt_histories, self.chat_template_name, self.tokenizer
         )
-        history_chat_flat.response = torch.stack(h_responses)
+        history_chat_flat.response = h_responses
         result.set(self.history_key, history_chat)
         return result
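This stack is the bug the commit title points at: responses extracted from full histories need not share a shape across the batch, and torch.stack refuses mismatched sizes. A minimal reproduction of the failure mode (plain tensors stand in for the History objects):

```python
import torch

# torch.stack requires every entry to have the same shape, which ragged
# per-sample responses do not.
resp_a, resp_b = torch.zeros(3), torch.zeros(5)
try:
    torch.stack([resp_a, resp_b])
except RuntimeError as exc:
    print(exc)  # stack expects each tensor to be equal size, ...
```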

torchrl/modules/llm/policies/vllm_wrapper.py

Lines changed: 1 addition & 1 deletion
@@ -726,7 +726,7 @@ def _from_vllm_generate_history(
         h_responses = _extract_responses_from_full_histories(
             text_full, prompt_histories, self.chat_template_name, self.tokenizer
         )
-        history_chat_flat.response = torch.stack(h_responses)
+        history_chat_flat.response = h_responses
         result.set(self.history_key, history_chat)
         return result
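The vLLM wrapper gets the identical one-line fix. Assigning the raw list defers any stacking to the surrounding History/tensordict machinery; as an analogy (not the wrappers' actual code path), tensordict's `lazy_stack` shows why deferring helps where a dense stack cannot:

```python
import torch
from tensordict import TensorDict, lazy_stack

# A lazy stack tolerates per-sample entries of different shapes, where
# the removed torch.stack could not.
stacked = lazy_stack(
    [
        TensorDict({"response": torch.zeros(3)}, batch_size=[]),
        TensorDict({"response": torch.zeros(5)}, batch_size=[]),
    ]
)
print(stacked.batch_size)  # torch.Size([2]); entries stay ragged underneath
```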
