Commit eeb57f9

[Data] Implement forceful releasing of actors upon shutdown of StreamingExecutor (#51769)
## Why are these changes needed?

We recently ran into an issue where:

1. Large pipeline executions were triggered in *tight* succession (one immediately after another).
2. We had N GPUs available, and all N were used by the ActorPool.
3. The GPUs were not released in time before the next execution began.
4. The subsequent dataset execution timed out after 10 minutes, unable to acquire the required GPUs.

Changes
---

1. Added a `force` param to the `PhysicalOperator.shutdown` method.
2. Revisited the pending/running actor release sequence to kill these actors when the shutdown is forced.
3. Made sure the shutdown sequence awaits the `on_exit` callback's return.
4. Cleaned up a bunch of dead code.

## Related issue number

## Checks

- [ ] I've signed off every commit (by using the -s flag, i.e., `git commit -s`) in this PR.
- [ ] I've run `scripts/format.sh` to lint the changes in this PR.
- [ ] I've included any doc changes needed for https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file.
- [ ] I've made sure the tests are passing. Note that there might be a few flaky tests; see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
  - [ ] Unit tests
  - [ ] Release tests
  - [ ] This PR is not tested :(

---------

Signed-off-by: Alexey Kudinkin <ak@anyscale.com>
1 parent 3da4bcb
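The core of the change is a graceful-then-forceful release sequence for the pool's actors: trigger each actor's `on_exit` hook, wait a bounded amount of time for those calls to complete, and only `ray.kill` the actors when the shutdown is forced (a killed actor cannot be restarted, which matters for lineage reconstruction). Below is a minimal, self-contained sketch of that pattern using plain Ray actors; the `Worker` class, the `release_actors` helper, and the 30-second constant are illustrative stand-ins rather than the actual Ray Data implementation (see the diffs below for that).

```python
import ray

ray.init(ignore_reinit_error=True)

# Illustrative constant mirroring _ACTOR_POOL_GRACEFUL_SHUTDOWN_TIMEOUT_S below.
GRACEFUL_SHUTDOWN_TIMEOUT_S = 30


@ray.remote
class Worker:
    """Hypothetical stand-in for a map-operator UDF actor."""

    def on_exit(self):
        # Cleanup hook; in Ray Data this is where UDF teardown would run.
        return "cleaned up"


def release_actors(actors, force: bool = False):
    """Release actors gracefully, killing them only on a forced shutdown."""
    # Trigger every actor's exit hook and collect the result refs.
    on_exit_refs = [actor.on_exit.remote() for actor in actors]

    # Give the actors a bounded window to finish their cleanup.
    ray.wait(
        on_exit_refs,
        num_returns=len(on_exit_refs),
        timeout=GRACEFUL_SHUTDOWN_TIMEOUT_S,
    )

    # Dropping the handles lets reference counting reclaim the actors; only a
    # forced shutdown kills them outright so their resources free up immediately.
    if force:
        for actor in actors:
            ray.kill(actor)


pool = [Worker.remote() for _ in range(2)]
release_actors(pool, force=True)
```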

6 files changed: +74 −202 lines

python/ray/data/_internal/execution/interfaces/physical_operator.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -457,7 +457,7 @@ def internal_queue_size(self) -> int:
         """
         return 0

-    def shutdown(self) -> None:
+    def shutdown(self, force: bool = False) -> None:
         """Abort execution and release all resources used by this operator.

         This release any Ray resources acquired by this operator such as active
```

python/ray/data/_internal/execution/operators/actor_pool_map_operator.py

Lines changed: 63 additions & 61 deletions
```diff
@@ -34,6 +34,8 @@
 class ActorPoolMapOperator(MapOperator):
     """A MapOperator implementation that executes tasks on an actor pool.

+    NOTE: This class is NOT thread-safe
+
     This class manages the state of a pool of actors used for task execution, as well
     as dispatch of tasks to those actors.
@@ -273,10 +275,10 @@ def all_inputs_done(self):
         # once the bundle queue is exhausted.
         self._inputs_done = True

-    def shutdown(self):
+    def shutdown(self, force: bool = False):
         # We kill all actors in the pool on shutdown, even if they are busy doing work.
-        self._actor_pool.kill_all_actors()
-        super().shutdown()
+        self._actor_pool.shutdown(force=force)
+        super().shutdown(force)

         # Warn if the user specified a batch or block size that prevents full
         # parallelization across the actor pool. We only know this information after
@@ -462,6 +464,8 @@ class _ActorPool(AutoscalingActorPool):
     actors when the operator is done submitting work to the pool.
     """

+    _ACTOR_POOL_GRACEFUL_SHUTDOWN_TIMEOUT_S = 30
+
     def __init__(
         self,
         compute_strategy: ActorPoolStrategy,
@@ -485,9 +489,6 @@ def __init__(
         self._running_actors: Dict[ray.actor.ActorHandle, _ActorState] = {}
         # Actors that are not yet ready (still pending creation).
         self._pending_actors: Dict[ObjectRef, ray.actor.ActorHandle] = {}
-        # Whether actors that become idle should be eagerly killed. This is False until
-        # the first call to kill_idle_actors().
-        self._should_kill_idle_actors = False
         # Track locality matching stats.
         self._locality_hits: int = 0
         self._locality_misses: int = 0
@@ -547,7 +548,7 @@ def scale_up(self, num_actors: int) -> int:
     def scale_down(self, num_actors: int) -> int:
         num_killed = 0
         for _ in range(num_actors):
-            if self.kill_inactive_actor():
+            if self._kill_inactive_actor():
                 num_killed += 1
         return num_killed

@@ -575,9 +576,6 @@ def add_pending_actor(self, actor: ray.actor.ActorHandle, ready_ref: ray.ObjectR
             actor: The not-yet-ready actor to add as pending to the pool.
             ready_ref: The ready future for the actor.
         """
-        # The caller shouldn't add new actors to the pool after invoking
-        # kill_inactive_actors().
-        assert not self._should_kill_idle_actors
         self._pending_actors[ready_ref] = actor

     def pending_to_running(self, ready_ref: ray.ObjectRef) -> bool:
@@ -664,11 +662,6 @@ def return_actor(self, actor: ray.actor.ActorHandle):
         assert actor in self._running_actors
         assert self._running_actors[actor].num_tasks_in_flight > 0
         self._running_actors[actor].num_tasks_in_flight -= 1
-        if (
-            self._should_kill_idle_actors
-            and self._running_actors[actor].num_tasks_in_flight == 0
-        ):
-            self._remove_actor(actor)

     def get_pending_actor_refs(self) -> List[ray.ObjectRef]:
         return list(self._pending_actors.keys())
@@ -692,7 +685,7 @@ def num_free_slots(self) -> int:
             for running_actor in self._running_actors.values()
         )

-    def kill_inactive_actor(self) -> bool:
+    def _kill_inactive_actor(self) -> bool:
         """Kills a single pending or idle actor, if any actors are pending/idle.

         Returns whether an inactive actor was actually killed.
@@ -709,70 +702,79 @@ def _maybe_kill_pending_actor(self) -> bool:
         if self._pending_actors:
             # At least one pending actor, so kill first one.
             ready_ref = next(iter(self._pending_actors.keys()))
-            self._remove_actor(self._pending_actors[ready_ref])
             del self._pending_actors[ready_ref]
             return True
         # No pending actors, so indicate to the caller that no actors were killed.
         return False

     def _maybe_kill_idle_actor(self) -> bool:
-        for actor, running_actor in self._running_actors.items():
-            if running_actor.num_tasks_in_flight == 0:
+        for actor, state in self._running_actors.items():
+            if state.num_tasks_in_flight == 0:
                 # At least one idle actor, so kill first one found.
-                self._remove_actor(actor)
+                # NOTE: This is a fire-and-forget op
+                self._release_running_actor(actor)
                 return True
         # No idle actors, so indicate to the caller that no actors were killed.
         return False

-    def kill_all_inactive_actors(self):
-        """Kills all currently inactive actors and ensures that all actors that become
-        idle in the future will be eagerly killed.
-
-        This is called once the operator is done submitting work to the pool, and this
-        function is idempotent. Adding new pending actors after calling this function
-        will raise an error.
-        """
-        self._kill_all_pending_actors()
-        self._kill_all_idle_actors()
-
-    def kill_all_actors(self):
+    def shutdown(self, force: bool = False):
         """Kills all actors, including running/active actors.

         This is called once the operator is shutting down.
         """
-        self._kill_all_pending_actors()
-        self._kill_all_running_actors()
+        self._release_pending_actors(force=force)
+        self._release_running_actors(force=force)

-    def _kill_all_pending_actors(self):
-        for _, actor in self._pending_actors.items():
-            self._remove_actor(actor)
+    def _release_pending_actors(self, force: bool):
+        # Release pending actors from the set of pending ones
+        pending = dict(self._pending_actors)
         self._pending_actors.clear()

-    def _kill_all_idle_actors(self):
-        idle_actors = [
-            actor
-            for actor, running_actor in self._running_actors.items()
-            if running_actor.num_tasks_in_flight == 0
-        ]
-        for actor in idle_actors:
-            self._remove_actor(actor)
-        self._should_kill_idle_actors = True
-
-    def _kill_all_running_actors(self):
-        actors = list(self._running_actors.keys())
-        for actor in actors:
-            self._remove_actor(actor)
-
-    def _remove_actor(self, actor: ray.actor.ActorHandle):
-        """Remove the given actor from the pool."""
-        # NOTE: we remove references to the actor and let ref counting
+        if force:
+            for _, actor in pending.items():
+                # NOTE: Actors can't be brought back after being ``ray.kill``-ed,
+                # hence we're only doing that if this is a forced release
+                ray.kill(actor)
+
+    def _release_running_actors(self, force: bool):
+        running = list(self._running_actors.keys())
+
+        on_exit_refs = []
+
+        # First release actors and collect their shutdown hook object-refs
+        for actor in running:
+            on_exit_refs.append(self._release_running_actor(actor))
+
+        # Wait for all actors to shutdown gracefully before killing them
+        ray.wait(on_exit_refs, timeout=self._ACTOR_POOL_GRACEFUL_SHUTDOWN_TIMEOUT_S)
+
+        # NOTE: Actors can't be brought back after being ``ray.kill``-ed,
+        # hence we're only doing that if this is a forced release
+        if force:
+            for actor in running:
+                ray.kill(actor)
+
+    def _release_running_actor(
+        self, actor: ray.actor.ActorHandle
+    ) -> Optional[ObjectRef]:
+        """Remove the given actor from the pool and trigger its `on_exit` callback.
+
+        This method returns a ``ref`` to the result
+        """
+        # NOTE: By default, we remove references to the actor and let ref counting
         # garbage collect the actor, instead of using ray.kill.
-        # Because otherwise the actor cannot be restarted upon lineage reconstruction.
-        if actor in self._running_actors:
-            # Call `on_exit` to trigger `UDF.__del__` which may perform
-            # cleanup operations.
-            actor.on_exit.remote()
-            del self._running_actors[actor]
+        #
+        # Otherwise, actor cannot be reconstructed for the purposes of produced
+        # object's lineage reconstruction.
+        if actor not in self._running_actors:
+            return None
+
+        # Call `on_exit` to trigger `UDF.__del__` which may perform
+        # cleanup operations.
+        ref = actor.on_exit.remote()
+        del self._running_actors[actor]
+
+        return ref

     def _get_location(self, bundle: RefBundle) -> Optional[NodeIdStr]:
         """Ask Ray for the node id of the given bundle.
```

python/ray/data/_internal/execution/operators/map_operator.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -473,7 +473,7 @@ def get_stats(self) -> StatsDict:
     def get_map_transformer(self) -> MapTransformer:
         return self._map_transformer

-    def shutdown(self):
+    def shutdown(self, force: bool = False):
         self._data_tasks.clear()
         self._metadata_tasks.clear()
```

python/ray/data/_internal/execution/operators/task_pool_map_operator.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -108,7 +108,7 @@ def _add_bundled_input(self, bundle: RefBundle):
         )
         self._submit_data_task(gen, bundle)

-    def shutdown(self):
+    def shutdown(self, force: bool = False):
         # Cancel all active tasks.
         for _, task in self._data_tasks.items():
             ray.cancel(task.get_waitable())
@@ -121,7 +121,8 @@ def shutdown(self):
                 # a different error, or cancellation failed. In all cases, we
                 # swallow the exception.
                 pass
-        super().shutdown()
+
+        super().shutdown(force)

     def progress_str(self) -> str:
         return ""
```

python/ray/data/_internal/execution/streaming_executor.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -220,7 +220,7 @@ def shutdown(self, exception: Optional[Exception] = None):
             self._global_info.set_description(prog_bar_msg)
             self._global_info.close()
         for op, state in self._topology.items():
-            op.shutdown()
+            op.shutdown(force=True)
             state.close_progress_bars()
         if exception is None:
             for callback in get_execution_callbacks(self._data_context):
```
