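amend: use EnhancedReasoningTransform instead of AddThinkingPrompt in GRPO make_env; whitespace/formatting cleanup in reason.py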

vmoens · vmoens · commit 185bff952f2b · 2025-07-04T17:02:44.000+01:00
diff --git a/sota-implementations/grpo/grpo_utils.py b/sota-implementations/grpo/grpo_utils.py
@@ -14,6 +14,7 @@
 from torchrl.collectors.llm.weight_update.vllm import vLLMUpdater
 from torchrl.envs.llm import AddThinkingPrompt, GSM8KEnv, KLRewardTransform, RetrieveKL
 from torchrl.envs.llm.datasets.ifeval import IFEvalEnv
+from torchrl.envs.llm.transforms.enhanced_reasoning import EnhancedReasoningTransform
 from torchrl.modules.llm import TransformersWrapper, vLLMWrapper
 from transformers.models.auto.modeling_auto import AutoModelForCausalLM
 from transformers.tokenization_utils import PreTrainedTokenizer
@@ -545,15 +546,24 @@ def make_env(cfg: DictConfig, devices: list[int] | None = None):
         raise NotImplementedError(f"Dataset {cfg.env.dataset} not implemented")
     if cfg.env.reasoning:
         env = env.append_transform(
-            AddThinkingPrompt(
-                cond=lambda td: td["reward"] <= reward_threshold
-                and td["step_count"] < max_steps,
-                role="assistant",
-                edit_last_turn=True,
-                zero_reward=True,
-                undo_done=True,
-                random_prompt=True,
-            ),
+            # AddThinkingPrompt(
+            #     cond=lambda td: td["reward"] <= reward_threshold
+            #     and td["step_count"] < max_steps,
+            #     role="assistant",
+            #     edit_last_turn=True,
+            #     zero_reward=True,
+            #     undo_done=True,
+            #     random_prompt=True,
+            # ),
+    EnhancedReasoningTransform(
+        cond=lambda td: td["reward"] <= 1.0 and td["step_count"] < 3,
+        strategy="user_guidance",  # User tells assistant to reconsider
+        reward_threshold=1.0,
+        max_steps=3,
+        zero_reward=True,
+        undo_done=True,
+        random_prompt=True,
+    )
         )
         env = env.append_transform(
             # RetrieveKL will be lazily initialized in the collector.
diff --git a/torchrl/envs/llm/transforms/reason.py b/torchrl/envs/llm/transforms/reason.py
@@ -251,17 +251,17 @@ def _replace_answer_with_prompt(self, content: str) -> str:
 
             # Clean up any trailing whitespace
             modified_content = modified_content.rstrip()
-            
+
             # Ensure we end with the EOS token if the original content had it
             if content.endswith("<|im_end|>"):
                 modified_content = modified_content.rstrip() + "<|im_end|>"
-                
+
             # Ensure proper spacing around the prompt
             if not modified_content.endswith(prompt):
                 # If the prompt wasn't properly inserted, append it
                 modified_content = content.rstrip()
                 if modified_content.endswith("<|im_end|>"):
-                    modified_content = modified_content[:-len("<|im_end|>")].rstrip()
+                    modified_content = modified_content[: -len("<|im_end|>")].rstrip()
                 modified_content = modified_content + "\n\n" + prompt + "<|im_end|>"
 
         else: