
Commit 48a0714

andrewjcg authored and facebook-github-bot committed
Support explicitly killing procs (#408)
Summary:
Pull Request resolved: #408

Not super graceful yet, but this adds some initial bookkeeping to support explicitly killing a proc mesh by reclaiming the alloc we put in `ProcEvents` and package in the `Keepalive` token. In particular, this adds a `PyProcMesh.stop` method which consumes the keepalive token -- leaving the `PyProcMesh` in an effectively unusable state -- and uses it to stop the alloc.

Reviewed By: vidhyav, mariusae

Differential Revision: D77250211

fbshipit-source-id: 16b30c55cd558c9c52a188aa3364af9379bb6323
1 parent 4e196f4 commit 48a0714
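
For orientation, here is a minimal usage sketch of the new API from the Python side, distilled from the `test_stop_proc_mesh` test added in this commit. The function name `stop_example` and the actor class `ExampleActor` are illustrative placeholders, and the helpers (`remote_process_allocator`, `StaticRemoteAllocInitializer`, `_100_MILLISECONDS`) are the fixtures from the commit's test file, assumed to be imported as they are there:

async def stop_example() -> None:
    # Allocate a 2x4 proc mesh across two local process-allocators,
    # mirroring the setup in python/tests/test_allocator.py below.
    spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
    with remote_process_allocator() as host1, remote_process_allocator() as host2:
        allocator = RemoteAllocator(
            world_id="stop_example",
            initializer=StaticRemoteAllocInitializer(host1, host2),
            heartbeat_interval=_100_MILLISECONDS,
        )
        alloc = await allocator.allocate(spec)
        proc_mesh = await ProcMesh.from_alloc(alloc)
        await proc_mesh.spawn("example_actor", ExampleActor)

        # Explicitly stop the mesh: under the hood this reclaims the alloc
        # held by `ProcEvents` and stops it, leaving the mesh unusable.
        await proc_mesh.stop()

        # Further use now fails; spawning again raises, e.g.:
        #   RuntimeError: `ProcMesh` has already been stopped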

File tree

4 files changed: +65 -12 lines

hyperactor_mesh/src/proc_mesh.rs

Lines changed: 4 additions & 0 deletions
@@ -526,6 +526,10 @@ impl ProcEvents {
             }
         }
     }
+
+    pub fn into_alloc(self) -> Box<dyn Alloc + Send + Sync> {
+        self.event_state.alloc
+    }
 }
 
 /// Spawns from shared ([`Arc`]) proc meshes, providing [`ActorMesh`]es with

monarch_hyperactor/src/proc_mesh.rs

Lines changed: 32 additions & 12 deletions
@@ -113,7 +113,7 @@ impl TrackedProcMesh {
 pub struct PyProcMesh {
     inner: SharedCell<TrackedProcMesh>,
     keepalive: Keepalive,
-    proc_events: Arc<Mutex<ProcEvents>>,
+    proc_events: SharedCell<Mutex<ProcEvents>>,
     stop_monitor_sender: mpsc::Sender<bool>,
     user_monitor_registered: AtomicBool,
 }
@@ -159,9 +159,11 @@ impl PyProcMesh {
     /// process on any proc failure.
     fn monitored(mut proc_mesh: ProcMesh, world_id: WorldId) -> Self {
         let (sender, abort_receiver) = mpsc::channel::<bool>(1);
-        let proc_events = Arc::new(Mutex::new(proc_mesh.events().unwrap()));
+        let proc_events = SharedCell::from(Mutex::new(proc_mesh.events().unwrap()));
         let monitor = tokio::spawn(Self::default_proc_mesh_monitor(
-            proc_events.clone(),
+            proc_events
+                .borrow()
+                .expect("borrowing immediately after creation"),
             world_id,
             abort_receiver,
         ));
@@ -177,7 +179,7 @@ impl PyProcMesh {
     /// The default monitor of the proc mesh for crashes. If a proc crashes, we print the reason
     /// to stderr and exit with code 1.
     async fn default_proc_mesh_monitor(
-        events: Arc<Mutex<ProcEvents>>,
+        events: SharedCellRef<Mutex<ProcEvents>>,
         world_id: WorldId,
         mut abort_receiver: mpsc::Receiver<bool>,
     ) {
@@ -197,7 +199,12 @@ impl PyProcMesh {
                     }
                 }
             }
-            _ = abort_receiver.recv() => {
+            _ = async {
+                tokio::select! {
+                    _ = events.preempted() => (),
+                    _ = abort_receiver.recv() => (),
+                }
+            } => {
                 // The default monitor is aborted, this happens when user takes over
                 // the monitoring responsibility.
                 eprintln!("stop default supervision monitor for ProcMesh {}", world_id);
@@ -320,6 +327,7 @@ impl PyProcMesh {
 
     fn stop<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
         let tracked_proc_mesh = self.inner.clone();
+        let proc_events = self.proc_events.clone();
         pyo3_async_runtimes::tokio::future_into_py(py, async move {
             async {
                 // "Take" the proc mesh wrapper. Once we do, it should be impossible for new
@@ -333,6 +341,9 @@ impl PyProcMesh {
                 children.discard_all().await?;
                 // Finally, take ownership of the inner proc mesh, which will allowing dropping it.
                 let _proc_mesh = proc_mesh.take().await?;
+                // Grab the alloc back from `ProcEvents` and use that to stop the mesh.
+                let mut alloc = proc_events.take().await?.into_inner().into_alloc();
+                alloc.stop_and_wait().await?;
                 anyhow::Ok(())
             }
             .await?;
@@ -372,7 +383,7 @@ impl Drop for KeepaliveState {
     module = "monarch._rust_bindings.monarch_hyperactor.proc_mesh"
 )]
 pub struct PyProcMeshMonitor {
-    proc_events: Arc<Mutex<ProcEvents>>,
+    proc_events: SharedCell<Mutex<ProcEvents>>,
 }
 
 #[pymethods]
@@ -384,13 +395,22 @@ impl PyProcMeshMonitor {
     fn __anext__(&self, py: Python<'_>) -> PyResult<PyObject> {
         let events = self.proc_events.clone();
         Ok(pyo3_async_runtimes::tokio::future_into_py(py, async move {
+            let events = events
+                .borrow()
+                .map_err(|_| PyRuntimeError::new_err("`ProcEvents` is shutdown"))?;
             let mut proc_events = events.lock().await;
-            let event: Option<_> = proc_events.next().await;
-            match event {
-                Some(event) => Ok(PyProcEvent::from(event)),
-                None => Err(::pyo3::exceptions::PyStopAsyncIteration::new_err(
-                    "stop iteration",
-                )),
+            tokio::select! {
+                () = events.preempted() => {
+                    Err(PyRuntimeError::new_err("shutting down `ProcEvents`"))
+                },
+                event = proc_events.next() => {
+                    match event {
+                        Some(event) => Ok(PyProcEvent::from(event)),
+                        None => Err(::pyo3::exceptions::PyStopAsyncIteration::new_err(
+                            "stop iteration",
+                        )),
+                    }
+                }
             }
         })?
         .into())

python/monarch/proc_mesh.py

Lines changed: 3 additions & 0 deletions
@@ -245,6 +245,9 @@ async def sync_workspace(self, auto_reload: bool = False) -> None:
         assert self._auto_reload_actor is not None
         await self._auto_reload_actor.reload.call()
 
+    async def stop(self) -> None:
+        await self._proc_mesh.stop()
+
 
 async def local_proc_mesh_nonblocking(
     *, gpus: Optional[int] = None, hosts: int = 1

python/tests/test_allocator.py

Lines changed: 26 additions & 0 deletions
@@ -192,6 +192,32 @@ async def test_allocate_2d_mesh(self) -> None:
 
         self.assert_computed_world_size(values, world_size)
 
+    async def test_stop_proc_mesh(self) -> None:
+        spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
+
+        # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+            proc_mesh = await ProcMesh.from_alloc(alloc)
+            actor = await proc_mesh.spawn("test_actor", TestActor)
+
+            await proc_mesh.stop()
+
+            with self.assertRaises(
+                RuntimeError, msg="`ProcMesh` has already been stopped"
+            ):
+                await proc_mesh.spawn("test_actor", TestActor)
+
+            # TODO(agallagher): It'd be nice to test that this just fails
+            # immediately, trying to access the wrapped actor mesh, but right
+            # now we doing casting without accessing the wrapped type.
+            del actor
+
     async def test_stacked_1d_meshes(self) -> None:
         # create two stacked actor meshes on the same host
         # each actor mesh running on separate process-allocators
