Skip to content

Commit 0e4f4a9

Browse files
committed
[10/n] restore debugger support to mesh_controller
Pull Request resolved: #433. Moves the debugger support directly into the controller actor, so there is no need to shuttle messages back and forth from the client. Differential Revision: [D77771081](https://our.internmc.facebook.com/intern/diff/D77771081/) **NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments; please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D77771081/)! ghstack-source-id: 294608438
1 parent fe29f6e commit 0e4f4a9

File tree

3 files changed

+85
-6
lines changed

3 files changed

+85
-6
lines changed

monarch_extension/src/mesh_controller.rs

Lines changed: 80 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
use std::collections::BTreeMap;
1010
use std::collections::HashMap;
1111
use std::collections::HashSet;
12+
use std::collections::VecDeque;
1213
use std::error::Error;
1314
use std::fmt::Debug;
1415
use std::fmt::Formatter;
@@ -21,6 +22,7 @@ use std::sync::atomic::AtomicUsize;
2122
use async_trait::async_trait;
2223
use hyperactor::Actor;
2324
use hyperactor::ActorHandle;
25+
use hyperactor::ActorId;
2426
use hyperactor::ActorRef;
2527
use hyperactor::Context;
2628
use hyperactor::HandleClient;
@@ -30,7 +32,6 @@ use hyperactor::PortRef;
3032
use hyperactor::cap::CanSend;
3133
use hyperactor::mailbox::MailboxSenderError;
3234
use hyperactor_mesh::Mesh;
33-
use hyperactor_mesh::ProcMesh;
3435
use hyperactor_mesh::actor_mesh::RootActorMesh;
3536
use hyperactor_mesh::shared_cell::SharedCell;
3637
use hyperactor_mesh::shared_cell::SharedCellRef;
@@ -44,6 +45,9 @@ use monarch_messages::controller::ControllerActor;
4445
use monarch_messages::controller::ControllerMessage;
4546
use monarch_messages::controller::Seq;
4647
use monarch_messages::controller::WorkerError;
48+
use monarch_messages::debugger::DebuggerAction;
49+
use monarch_messages::debugger::DebuggerActor;
50+
use monarch_messages::debugger::DebuggerMessage;
4751
use monarch_messages::worker::Ref;
4852
use monarch_messages::worker::WorkerMessage;
4953
use monarch_messages::worker::WorkerParams;
@@ -611,12 +615,84 @@ struct MeshControllerActor {
611615
workers: Option<SharedCell<RootActorMesh<'static, WorkerActor>>>,
612616
history: History,
613617
id: usize,
618+
debugger_active: Option<ActorRef<DebuggerActor>>,
619+
debugger_paused: VecDeque<ActorRef<DebuggerActor>>,
614620
}
615621

616622
impl MeshControllerActor {
617623
fn workers(&self) -> SharedCellRef<RootActorMesh<'static, WorkerActor>> {
618624
self.workers.as_ref().unwrap().borrow().unwrap()
619625
}
626+
fn handle_debug(
627+
&mut self,
628+
this: &Context<Self>,
629+
debugger_actor_id: ActorId,
630+
action: DebuggerAction,
631+
) -> anyhow::Result<()> {
632+
if matches!(action, DebuggerAction::Paused()) {
633+
self.debugger_paused
634+
.push_back(ActorRef::attest(debugger_actor_id));
635+
} else {
636+
let debugger_actor = self
637+
.debugger_active
638+
.as_ref()
639+
.ok_or_else(|| anyhow::anyhow!("no active debugger"))?;
640+
if debugger_actor_id != *debugger_actor.actor_id() {
641+
anyhow::bail!("debugger action for wrong actor");
642+
}
643+
match action {
644+
DebuggerAction::Detach() => {
645+
self.debugger_active = None;
646+
}
647+
DebuggerAction::Read { requested_size } => {
648+
Python::with_gil(|py| {
649+
let read = py
650+
.import("monarch.controller.debugger")
651+
.unwrap()
652+
.getattr("read")
653+
.unwrap();
654+
let bytes: Vec<u8> =
655+
read.call1((requested_size,)).unwrap().extract().unwrap();
656+
657+
debugger_actor.send(
658+
this,
659+
DebuggerMessage::Action {
660+
action: DebuggerAction::Write { bytes },
661+
},
662+
)
663+
})?;
664+
}
665+
DebuggerAction::Write { bytes } => {
666+
Python::with_gil(|py| -> Result<(), anyhow::Error> {
667+
let write = py
668+
.import("monarch.controller.debugger")
669+
.unwrap()
670+
.getattr("write")
671+
.unwrap();
672+
write.call1((String::from_utf8(bytes)?,)).unwrap();
673+
Ok(())
674+
})?;
675+
}
676+
_ => {
677+
anyhow::bail!("unexpected action: {:?}", action);
678+
}
679+
}
680+
}
681+
if self.debugger_active.is_none() {
682+
self.debugger_active = self.debugger_paused.pop_front().and_then(|pdb_actor| {
683+
pdb_actor
684+
.send(
685+
this,
686+
DebuggerMessage::Action {
687+
action: DebuggerAction::Attach(),
688+
},
689+
)
690+
.map(|_| pdb_actor)
691+
.ok()
692+
});
693+
}
694+
Ok(())
695+
}
620696
}
621697

622698
impl Debug for MeshControllerActor {
@@ -642,6 +718,8 @@ impl Actor for MeshControllerActor {
642718
workers: None,
643719
history: History::new(world_size),
644720
id,
721+
debugger_active: None,
722+
debugger_paused: VecDeque::new(),
645723
})
646724
}
647725
async fn init(&mut self, this: &Instance<Self>) -> Result<(), anyhow::Error> {
@@ -681,8 +759,7 @@ impl Handler<ControllerMessage> for MeshControllerActor {
681759
debugger_actor_id,
682760
action,
683761
} => {
684-
let dm = crate::client::DebuggerMessage::new(debugger_actor_id.into(), action)?;
685-
panic!("NYI: debugger message handling");
762+
self.handle_debug(this, debugger_actor_id, action)?;
686763
}
687764
ControllerMessage::Status {
688765
seq,

python/monarch/_rust_bindings/monarch_extension/mesh_controller.pyi

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,6 @@ class _Controller:
3030
ranks: Union[NDSlice, List[NDSlice]],
3131
msg: NamedTuple,
3232
) -> None: ...
33-
def _debugger_attach(self, debugger_actor_id: ActorId) -> None: ...
34-
def _debugger_write(self, debugger_actor_id: ActorId, data: bytes) -> None: ...
3533
def _drain_and_stop(
3634
self,
3735
) -> List[client.LogMessage | client.WorkerResponse | client.DebuggerMessage]: ...

python/monarch/mesh_controller.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import atexit
88
import logging
99
import os
10+
11+
import pdb # noqa
1012
import traceback
1113
from collections import deque
1214
from logging import Logger
@@ -23,7 +25,6 @@
2325
)
2426

2527
import torch.utils._python_dispatch
26-
2728
from monarch._rust_bindings.monarch_extension import client
2829
from monarch._rust_bindings.monarch_extension.client import ( # @manual=//monarch/monarch_extension:monarch_extension
2930
WorldState,
@@ -41,6 +42,8 @@
4142
from monarch.common.stream import StreamRef
4243
from monarch.common.tensor import Tensor
4344

45+
from monarch.tensor_worker_main import _set_trace
46+
4447
if TYPE_CHECKING:
4548
from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
4649
ProcMesh as HyProcMesh,
@@ -120,6 +123,7 @@ def _initialize_env(worker_point: Point, proc_id: str) -> None:
120123
"LOCAL_WORLD_SIZE": str(gpus_per_host),
121124
}
122125
os.environ.update(process_env)
126+
pdb.set_trace = _set_trace
123127
except Exception:
124128
traceback.print_exc()
125129
raise

0 commit comments

Comments
 (0)