amend

vmoens · vmoens · commit 0b658bfa9b7d · 2025-07-04T16:53:53.000+01:00
diff --git a/sota-implementations/grpo/grpo_utils.py b/sota-implementations/grpo/grpo_utils.py
@@ -549,9 +549,10 @@ def make_env(cfg: DictConfig, devices: list[int] | None = None):
                 cond=lambda td: td["reward"] <= reward_threshold
                 and td["step_count"] < max_steps,
                 role="assistant",
-                edit_last_turn=False,
-                zero_reward=False,
+                edit_last_turn=True,
+                zero_reward=True,
                 undo_done=True,
+                random_prompt=True,
             ),
         )
         env = env.append_transform(
diff --git a/torchrl/envs/llm/transforms/reason.py b/torchrl/envs/llm/transforms/reason.py
@@ -238,18 +238,31 @@ def _replace_answer_with_prompt(self, content: str) -> str:
             The modified content with the answer replaced by the thinking prompt
         """
         # Pattern to match <answer>...</answer> with optional EOS token
+        # Non-greedy `.*?` stops at the first </answer>; a trailing <|im_end|> (after optional whitespace) is matched too
         answer_pattern = r"<answer>.*?</answer>(?:\s*<\|im_end\|>)?"
 
         # Check if there's an answer tag
         if "<answer>" in content:
             # Replace the answer section with the thinking prompt
             prompt = self.prompt
 
-            # Replace the answer section
+            # Replace the answer section; the pattern also consumes a directly following EOS token, which is re-appended below
             modified_content = re.sub(answer_pattern, prompt, content, flags=re.DOTALL)
 
             # Clean up any trailing whitespace
             modified_content = modified_content.rstrip()
+            
+            # Ensure we end with the EOS token if the original content had it
+            if content.endswith("<|im_end|>"):
+                modified_content = modified_content.rstrip() + "<|im_end|>"
+                
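+            # Check that the result ends with the prompt; NOTE(review): this likely fails whenever EOS was appended above, triggering the fallback - confirm intended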
+            if not modified_content.endswith(prompt):
+                # Fallback: rebuild from the original content - drop a trailing EOS, then append the prompt followed by EOS
+                modified_content = content.rstrip()
+                if modified_content.endswith("<|im_end|>"):
+                    modified_content = modified_content[:-len("<|im_end|>")].rstrip()
+                modified_content = modified_content + "\n\n" + prompt + "<|im_end|>"
 
         else:
             # No answer tag found, just append the prompt