
Commit 7e71b3e

suo authored and facebook-github-bot committed
heartbeat in child processes to ensure cleanup (#435)
Summary:

Pull Request resolved: #435

In D77392241, I removed the signal-based cleanup mechanism. The assumption there was that the `rx.recv()` in `bootstrap.rs` would return an error if the other side hung up, and that the error would bubble up and eventually abort the child process.

This assumption is wrong: `rx.recv()` has server-like semantics, not channel-like (which makes sense; there could be many senders, so any individual one disappearing should not abort the receiver!). As a result, we were not cleaning up properly: if the parent process exited, child processes would just hang around forever.

The test added in D77348271 *happened* to pass, because we sent `SIGKILL` to the parent process, triggering an unclean shutdown, which *will* cause an error on the receiver side. However, a *graceful* shutdown (e.g. from an uncaught Python exception) will not. A quick solution is to add a simple heartbeat task and kill the process if it fails.

ghstack-source-id: 294670138
exported-using-ghexport

Reviewed By: ahmadsharif1

Differential Revision: D77802426

fbshipit-source-id: 0b7f7b76aa528a22b3921d897c9b04d20914cc69
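For orientation, below is a minimal, self-contained sketch of the heartbeat-and-exit pattern the summary describes, using plain tokio rather than hyperactor's channel API. The `ChildToParent` enum, the mpsc channel standing in for the child-to-allocator connection, and the 1-second period are illustrative assumptions, not part of this codebase; the actual implementation is `exit_if_missed_heartbeat` in the `bootstrap.rs` diff further down.

    // Sketch only; requires tokio = { version = "1", features = ["full"] }.
    use std::time::Duration;
    use tokio::sync::mpsc;

    #[derive(Debug)]
    enum ChildToParent {
        Heartbeat,
    }

    // Runs on the child side: periodically send a heartbeat; if the send fails
    // (the parent side is gone), terminate instead of lingering forever.
    async fn heartbeat_loop(tx: mpsc::Sender<ChildToParent>) {
        loop {
            // The commit uses a 5-second period; 1 second keeps this demo short.
            tokio::time::sleep(Duration::from_secs(1)).await;
            if let Err(err) = tx.send(ChildToParent::Heartbeat).await {
                eprintln!("heartbeat failed ({err}); peer is gone, exiting");
                std::process::exit(1);
            }
        }
    }

    #[tokio::main]
    async fn main() {
        let (tx, mut rx) = mpsc::channel(8);
        tokio::spawn(heartbeat_loop(tx));

        // Stand-in for the allocator side: accept a couple of heartbeats, then
        // drop the receiver to simulate the parent going away gracefully.
        for _ in 0..2 {
            if let Some(msg) = rx.recv().await {
                println!("got {msg:?}");
            }
        }
        drop(rx); // the next send in heartbeat_loop fails and the process exits(1)
        tokio::time::sleep(Duration::from_secs(3)).await;
        unreachable!("the heartbeat task should have exited the process by now");
    }

The point of the pattern is that liveness is detected on the *sender* side: the child does not rely on the receiver erroring out, which, as the summary notes, does not happen on a graceful parent shutdown.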
1 parent 7ee3afb · commit 7e71b3e

File tree

4 files changed (+188, −25 lines)


hyperactor_mesh/src/alloc/process.rs

Lines changed: 3 additions & 6 deletions
@@ -331,12 +331,6 @@ impl ProcessAlloc {
         cmd.stdout(Stdio::piped());
         cmd.stderr(Stdio::piped());

-        // Opt-in to signal handling (`PR_SET_PDEATHSIG`) so that the
-        // spawned subprocess will automatically exit when the parent
-        // process dies.
-        // TODO: Use hyperactor::config::global::MANAGED_SUBPROCESS_ENV once it's defined
-        cmd.env("HYPERACTOR_MANAGED_SUBPROCESS", "1");
-
         let proc_id = ProcId(WorldId(self.name.to_string()), index);
         tracing::debug!("Spawning process {:?}", cmd);
         match cmd.spawn() {
@@ -430,6 +424,9 @@ impl Alloc for ProcessAlloc {
                             addr,
                         });
                     }
+                    Process2AllocatorMessage::Heartbeat => {
+                        tracing::debug!("recv heartbeat from {index}");
+                    }
                 }
             },

hyperactor_mesh/src/bootstrap.rs

Lines changed: 49 additions & 8 deletions
@@ -16,6 +16,8 @@ use hyperactor::channel::ChannelAddr;
 use hyperactor::channel::ChannelTransport;
 use hyperactor::channel::Rx;
 use hyperactor::channel::Tx;
+use hyperactor::clock::Clock;
+use hyperactor::clock::RealClock;
 use hyperactor::mailbox::MailboxServer;
 use serde::Deserialize;
 use serde::Serialize;
@@ -44,6 +46,8 @@ pub(crate) enum Process2AllocatorMessage {
     /// after instruction by the allocator through the corresponding
     /// [`Allocator2Process`] message.
     StartedProc(ProcId, ActorRef<MeshAgent>, ChannelAddr),
+
+    Heartbeat,
 }

 /// Messages sent from the allocator to a process.
@@ -62,6 +66,43 @@ pub(crate) enum Allocator2Process {
     Exit(i32),
 }

+async fn exit_if_missed_heartbeat(bootstrap_index: usize, bootstrap_addr: ChannelAddr) {
+    let tx = match channel::dial(bootstrap_addr.clone()) {
+        Ok(tx) => tx,
+
+        Err(err) => {
+            tracing::error!(
+                "Failed to establish heartbeat connection to allocator, exiting! (addr: {:?}): {}",
+                bootstrap_addr,
+                err
+            );
+            std::process::exit(1);
+        }
+    };
+    tracing::info!(
+        "Heartbeat connection established to allocator (idx: {bootstrap_index}, addr: {bootstrap_addr:?})",
+    );
+    loop {
+        RealClock.sleep(Duration::from_secs(5)).await;
+
+        let result = tx
+            .send(Process2Allocator(
+                bootstrap_index,
+                Process2AllocatorMessage::Heartbeat,
+            ))
+            .await;
+
+        if let Err(err) = result {
+            tracing::error!(
+                "Heartbeat failed to allocator, exiting! (addr: {:?}): {}",
+                bootstrap_addr,
+                err
+            );
+            std::process::exit(1);
+        }
+    }
+}
+
 /// Entry point to processes managed by hyperactor_mesh. This advertises the process
 /// to a bootstrap server, and receives instructions to manage the lifecycle(s) of
 /// procs within this process.
@@ -86,15 +127,15 @@ pub async fn bootstrap() -> anyhow::Error {
         .parse()?;
     let listen_addr = ChannelAddr::any(bootstrap_addr.transport());
     let (serve_addr, mut rx) = channel::serve(listen_addr).await?;
-    let tx = channel::dial(bootstrap_addr)?;
+    let tx = channel::dial(bootstrap_addr.clone())?;

-    {
-        tx.send(Process2Allocator(
-            bootstrap_index,
-            Process2AllocatorMessage::Hello(serve_addr),
-        ))
-        .await?;
-    }
+    tx.send(Process2Allocator(
+        bootstrap_index,
+        Process2AllocatorMessage::Hello(serve_addr),
+    ))
+    .await?;
+
+    tokio::spawn(exit_if_missed_heartbeat(bootstrap_index, bootstrap_addr));

     let mut procs = Vec::new();

python/tests/error_test_binary.py

Lines changed: 47 additions & 8 deletions
@@ -48,6 +48,13 @@ async def await_then_error(self) -> None:
         await asyncio.sleep(0.1)
         raise RuntimeError("oh noez")

+    @endpoint
+    async def get_pid(self) -> int:
+        """Endpoint that returns the process PID."""
+        import os
+
+        return os.getpid()
+

 class ErrorActorSync(Actor):
     """An actor that has endpoints cause segfaults."""
@@ -79,8 +86,7 @@ def _run_error_test_sync(num_procs, sync_endpoint, endpoint_name):
     error_actor = proc.spawn("error_actor", actor_class).get()

     # This output is checked in the test to make sure that the process actually got here
-    print("I actually ran")
-    sys.stdout.flush()
+    print("Started function error_test", flush=True)

     if endpoint_name == "cause_segfault":
         endpoint = error_actor.cause_segfault
@@ -110,8 +116,7 @@ async def run_test():
         error_actor = await proc.spawn("error_actor", actor_class)

         # This output is checked in the test to make sure that the process actually got here
-        print("I actually ran")
-        sys.stdout.flush()
+        print("Started function error_test", flush=True)

         if endpoint_name == "cause_segfault":
             endpoint = error_actor.cause_segfault
@@ -153,15 +158,13 @@ def error_endpoint(num_procs, sync_test_impl, sync_endpoint, endpoint_name):

 @main.command("error-bootstrap")
 def error_bootstrap():
-    print("I actually ran")
-    sys.stdout.flush()
+    print("Started function error_bootstrap", flush=True)

     proc_mesh(gpus=4, env={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}).get()


 async def _error_unmonitored():
-    print("I actually ran")
-    sys.stdout.flush()
+    print("Started function _error_unmonitored", flush=True)

     proc = await proc_mesh(gpus=1)
     actor = await proc.spawn("error_actor", ErrorActor)
@@ -204,5 +207,41 @@ def error_unmonitored():
     asyncio.run(_error_unmonitored())


+async def _error_cleanup():
+    """Test function that spawns an 8 process procmesh and calls an endpoint that returns a normal exception."""
+    print("Started function _error_cleanup() for parent process", flush=True)
+
+    # Spawn an 8 process procmesh
+    proc = await proc_mesh(gpus=8)
+    error_actor = await proc.spawn("error_actor", ErrorActor)
+
+    print("Procmesh spawned, collecting child PIDs from actors", flush=True)
+
+    # Get PIDs from all actor processes
+    try:
+        # Call get_pid endpoint on all actors to collect their PIDs
+        pids = await error_actor.get_pid.call()
+        child_pids = [str(pid) for _, pid in pids]
+        print(f"CHILD_PIDS: {','.join(child_pids)}", flush=True)
+    except Exception as e:
+        print(f"Error getting child PIDs from actors: {e}", flush=True)
+
+    print("About to call endpoint that raises exception", flush=True)
+
+    # Call an endpoint that raises a normal exception
+    try:
+        await error_actor.await_then_error.call()
+    except Exception as e:
+        print(f"Expected exception caught: {e}", flush=True)
+        # Re-raise to cause the process to exit with non-zero code
+        raise
+
+
+@main.command("error-cleanup")
+def error_cleanup():
+    """Command that spawns an 8 process procmesh and calls an endpoint that returns a normal exception."""
+    asyncio.run(_error_cleanup())
+
+
 if __name__ == "__main__":
     main()

python/tests/test_actor_error.py

Lines changed: 89 additions & 3 deletions
@@ -140,7 +140,7 @@ def test_actor_supervision(num_procs, sync_endpoint, sync_test_impl, endpoint_na
         raise

     # Assert that the subprocess exited with a non-zero code
-    assert "I actually ran" in process.stdout.decode()
+    assert "Started function error_test" in process.stdout.decode()
     assert (
         process.returncode != 0
     ), f"Expected non-zero exit code, got {process.returncode}"
@@ -170,7 +170,7 @@ def test_proc_mesh_bootstrap_error():
         raise

     # Assert that the subprocess exited with a non-zero code
-    assert "I actually ran" in process.stdout.decode()
+    assert "Started function error_bootstrap" in process.stdout.decode()
     assert (
         process.returncode != 0
     ), f"Expected non-zero exit code, got {process.returncode}"
@@ -234,12 +234,98 @@ async def test_exception_after_wait_unmonitored():
         raise

     # Assert that the subprocess exited with a non-zero code
-    assert "I actually ran" in process.stdout.decode()
+    assert "Started function _error_unmonitored" in process.stdout.decode()
     assert (
         process.returncode != 0
     ), f"Expected non-zero exit code, got {process.returncode}"


+# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
+@pytest.mark.oss_skip
+def test_python_actor_process_cleanup():
+    """
+    Test that PythonActor processes are cleaned up when the parent process dies.
+
+    This test spawns an 8 process procmesh and calls an endpoint that returns a normal exception,
+    then verifies that all spawned processes have been cleaned up after the spawned binary dies.
+    """
+    import os
+    import signal
+    import time
+
+    # Run the error-cleanup test in a subprocess
+    test_bin = importlib.resources.files("monarch.python.tests").joinpath("test_bin")
+    cmd = [
+        str(test_bin),
+        "error-cleanup",
+    ]
+
+    try:
+        print("running cmd", " ".join(cmd))
+        process = subprocess.run(cmd, capture_output=True, timeout=180, text=True)
+    except subprocess.TimeoutExpired as e:
+        print("timeout expired")
+        if e.stdout is not None:
+            print(e.stdout.decode())
+        if e.stderr is not None:
+            print(e.stderr.decode())
+        raise
+
+    # Read stdout line by line to get child PIDs
+    assert "Started function _error_cleanup() for parent process" in process.stdout
+
+    child_pids = set()
+    for line in process.stdout.splitlines():
+        if line.startswith("CHILD_PIDS: "):
+            pids_str = line[len("CHILD_PIDS: ") :]  # noqa
+            child_pids = {
+                int(pid.strip()) for pid in pids_str.split(",") if pid.strip()
+            }
+            print(f"Extracted child PIDs: {child_pids}")
+            break
+
+    if not child_pids:
+        raise AssertionError("No child PIDs found in output")
+
+    assert child_pids, "No child PIDs were collected from subprocess output"
+
+    # Wait for child processes to be cleaned up
+    print("Waiting for child processes to be cleaned up...")
+    cleanup_timeout = 120
+    start_time = time.time()
+
+    def is_process_running(pid):
+        """Check if a process with the given PID is still running."""
+        try:
+            os.kill(pid, 0)  # Signal 0 doesn't kill, just checks if process exists
+            return True
+        except OSError:
+            return False
+
+    still_running = set(child_pids)
+
+    while time.time() - start_time < cleanup_timeout:
+        if not still_running:
+            print("All child processes have been cleaned up!")
+            return
+
+        still_running = {pid for pid in still_running if is_process_running(pid)}

+        print(f"Still running child PIDs: {still_running}")
+        time.sleep(2)
+
+    # If we get here, some processes are still running
+    # Try to clean up remaining processes
+    for pid in still_running:
+        try:
+            os.kill(pid, signal.SIGKILL)
+        except OSError:
+            pass
+    raise AssertionError(
+        f"Child processes not cleaned up after {cleanup_timeout}s: {still_running}"
+    )
+
+
 class ErrorActor(Actor):
     def __init__(self, message):
         raise RuntimeError("fail on init")
