Commit b486c66

amend
1 parent 8151cf4 commit b486c66

4 files changed, 11 insertions(+), 5 deletions(-)

sota-implementations/grpo/grpo-async.py

Lines changed: 3 additions & 0 deletions
@@ -177,6 +177,9 @@ def train(
     start_time = time.time()
 
     for step in range(total_steps):
+        if not collector.is_running():
+            torchrl_logger.info("Collector stopped, stopping training")
+            break
         pbar.update(1)
         pbar.set_description(f"Step {step}, writes: {replay_buffer.write_count}")
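
Note: a minimal sketch of the async lifecycle this guard assumes. `make_collector`, `replay_buffer`, and the optimizer step are hypothetical placeholders; `start()`, `is_running()`, and `async_shutdown()` are the methods touched by this commit.

    collector = make_collector()  # hypothetical factory, not part of this commit
    collector.start()             # fills the replay buffer from a background thread
    try:
        for step in range(total_steps):
            if not collector.is_running():
                break             # the guard added above: stop once the worker dies
            batch = replay_buffer.sample()
            # ... optimizer step on `batch` (placeholder) ...
    finally:
        collector.async_shutdown()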

torchrl/collectors/collectors.py

Lines changed: 4 additions & 1 deletion
@@ -1358,7 +1358,7 @@ def start(self):
         """
         if self.replay_buffer is None:
             raise RuntimeError("Replay buffer must be defined for execution.")
-        if not hasattr(self, "_thread") or not self._thread.is_alive():
+        if not self.is_running():
             self._stop = False
             self._thread = threading.Thread(target=self._run_iterator)
             self._thread.daemon = (
@@ -1371,6 +1371,9 @@ def _run_iterator(self):
         if self._stop:
             return
 
+    def is_running(self):
+        return hasattr(self, "_thread") and self._thread.is_alive()
+
     def async_shutdown(
         self, timeout: float | None = None, close_env: bool = True
     ) -> None:
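
Note: a self-contained sketch of the pattern being factored out, assuming nothing beyond the standard library. The inline hasattr/is_alive test in start() becomes a single reusable is_running() helper; the Worker class is illustrative, not TorchRL code.

    import threading
    import time

    class Worker:
        def start(self):
            if not self.is_running():  # idempotent: never spawns a second thread
                self._stop = False
                self._thread = threading.Thread(target=self._run)
                self._thread.daemon = True
                self._thread.start()

        def is_running(self):
            # Safe to call before start(): hasattr guards the missing attribute.
            return hasattr(self, "_thread") and self._thread.is_alive()

        def _run(self):
            while not self._stop:
                time.sleep(0.1)

        def stop(self):
            if self.is_running():
                self._stop = True
                self._thread.join()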

torchrl/collectors/llm/ray_collector.py

Lines changed: 3 additions & 0 deletions
@@ -170,6 +170,9 @@ def start(self):
         pending_task = self._collector.start.remote()
         return ray.get(pending_task)
 
+    def is_running(self):
+        return ray.get(self._collector.is_running.remote())
+
     def shutdown(self):
         """Shuts down the collector."""
         pending_task = self._collector.shutdown.remote()
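
Note: the wrapper exposes the same is_running() interface by forwarding the call to the remote actor and blocking on the result. A standalone sketch of that proxy pattern, using Ray's standard actor API (RemoteCollector and CollectorProxy are made-up names, not TorchRL code):

    import ray

    ray.init()

    @ray.remote
    class RemoteCollector:
        def __init__(self):
            self._running = False

        def start(self):
            self._running = True

        def is_running(self):
            return self._running

    class CollectorProxy:
        def __init__(self):
            self._collector = RemoteCollector.remote()

        def is_running(self):
            # Synchronous facade: block until the actor answers.
            return ray.get(self._collector.is_running.remote())

    proxy = CollectorProxy()
    print(proxy.is_running())  # False until start() runs on the actor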

torchrl/objectives/llm/grpo.py

Lines changed: 1 addition & 4 deletions
@@ -261,10 +261,6 @@ def forward(self, tensordict: TensorDictBase) -> GRPOLossOutput:
             raise ValueError(
                 f"advantage and log_weight must have the same number of dimensions, got {advantage.ndim=} and {log_weight.ndim=}"
             )
-        print(f"log_weight: {log_weight.shape}")
-        print(f"advantage: {advantage.shape}")
-        print(f"mask: {mask.shape}")
-        print(f"data: {tensordict}")
         gain1 = log_weight.exp() * advantage
 
         log_weight_clip = log_weight.clamp(*self._clip_bounds)
@@ -503,6 +499,7 @@ def _inv_call(self, tensordict: TensorDictBase) -> TensorDictBase:
         torchrl_logger.info(f"Computing advantage for {prompt=}")
         # Cat is the most robust way to combine the trajs
         tds = torch.cat(list(self.queues[prompt]), -1)
+        del self.queues[prompt]
         # Collect rewards
         reward = tds.get(self.rewards_key, as_nested_tensor=True)
         reward_mean = reward.values().mean()
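
Note: the one functional change in this hunk drops the per-prompt queue once its trajectories have been concatenated, presumably so consumed data is neither kept alive nor combined again on a later pass. A toy sketch of that consume-then-release pattern; combine stands in for the torch.cat call in the real code:

    queues = {"prompt_a": [1, 2, 3]}

    def pop_and_combine(queues, prompt, combine=sum):
        combined = combine(queues[prompt])
        del queues[prompt]  # release the consumed trajectories
        return combined

    print(pop_and_combine(queues, "prompt_a"))  # 6; the "prompt_a" entry is gone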
