pytorch-labs
diff --git a/monarch_extension/Cargo.toml b/monarch_extension/Cargo.toml
Lines changed: 1 addition & 0 deletions
diff --git a/monarch_extension/src/lib.rs b/monarch_extension/src/lib.rs
Lines changed: 2 additions & 0 deletions
diff --git a/monarch_hyperactor/Cargo.toml b/monarch_hyperactor/Cargo.toml
Lines changed: 1 addition & 0 deletions
diff --git a/monarch_hyperactor/src/actor_mesh.rs b/monarch_hyperactor/src/actor_mesh.rs
Lines changed: 1 addition & 1 deletion
diff --git a/monarch_hyperactor/src/mailbox.rs b/monarch_hyperactor/src/mailbox.rs
Lines changed: 2 additions & 2 deletions
diff --git a/monarch_hyperactor/src/proc_mesh.rs b/monarch_hyperactor/src/proc_mesh.rs
Lines changed: 50 additions & 10 deletions
diff --git a/monarch_rdma/extension/Cargo.toml b/monarch_rdma/extension/Cargo.toml
Lines changed: 20 additions & 0 deletions
@@ -26,6 +26,7 @@ hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiproces
 libc = "0.2.139"
 monarch_hyperactor = { version = "0.0.0", path = "../monarch_hyperactor" }
 monarch_messages = { version = "0.0.0", path = "../monarch_messages", optional = true }
+monarch_rdma_extension = { version = "0.0.0", path = "../monarch_rdma/extension" }
 monarch_simulator_lib = { version = "0.0.0", path = "../monarch_simulator", optional = true }
 monarch_tensor_worker = { version = "0.0.0", path = "../monarch_tensor_worker", optional = true }
 monarch_types = { version = "0.0.0", path = "../monarch_types" }
 
@@ -120,6 +120,7 @@ pub fn mod_init(module: &Bound<'_, PyModule>) -> PyResult<()> {
             module,
             "monarch_extension.mesh_controller",
         )?)?;
+        monarch_rdma_extension::register_python_bindings(&get_or_add_new_module(module, "rdma")?)?;
     }
     simulation_tools::register_python_bindings(&get_or_add_new_module(
         module,
@@ -166,6 +167,7 @@ pub fn mod_init(module: &Bound<'_, PyModule>) -> PyResult<()> {
         module,
         "monarch_hyperactor.runtime",
     )?)?;
+
     hyperactor_extension::alloc::register_python_bindings(&get_or_add_new_module(
         module,
         "hyperactor_extension.alloc",
 
@@ -20,6 +20,7 @@ hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" }
 hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" }
 hyperactor_telemetry = { version = "0.0.0", path = "../hyperactor_telemetry" }
 inventory = "0.3.8"
+monarch_rdma = { version = "0.0.0", path = "../monarch_rdma" }
 monarch_types = { version = "0.0.0", path = "../monarch_types" }
 ndslice = { version = "0.0.0", path = "../ndslice" }
 pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods"] }
 
@@ -62,7 +62,7 @@ impl PythonActorMesh {
     }
 
     /// Returns a clone of the client mailbox held by this actor mesh.
     #[getter]
-    fn client(&self) -> PyMailbox {
+    pub fn client(&self) -> PyMailbox {
         self.client.clone()
     }
 
 
@@ -52,8 +52,8 @@ use crate::shape::PyShape;
     name = "Mailbox",
     module = "monarch._rust_bindings.monarch_hyperactor.mailbox"
 )]
-pub(super) struct PyMailbox {
-    pub(super) inner: Mailbox,
+pub struct PyMailbox {
+    pub inner: Mailbox,
 }
 
 #[pymethods]
 
@@ -28,6 +28,8 @@ use hyperactor_mesh::proc_mesh::SharedSpawnable;
 use hyperactor_mesh::shared_cell::SharedCell;
 use hyperactor_mesh::shared_cell::SharedCellPool;
 use hyperactor_mesh::shared_cell::SharedCellRef;
+use monarch_rdma::IbverbsConfig;
+use monarch_rdma::RdmaManagerActor;
 use monarch_types::PickledPyObject;
 use ndslice::Shape;
 use pyo3::IntoPyObjectExt;
@@ -116,9 +118,14 @@ pub struct PyProcMesh {
     proc_events: SharedCell<Mutex<ProcEvents>>,
     stop_monitor_sender: mpsc::Sender<bool>,
     user_monitor_registered: AtomicBool,
+    pub(super) rdma_manager: Option<SharedCell<RootActorMesh<'static, RdmaManagerActor>>>,
 }
 
-fn allocate_proc_mesh<'py>(py: Python<'py>, alloc: &PyAlloc) -> PyResult<Bound<'py, PyAny>> {
+fn allocate_proc_mesh<'py>(
+    py: Python<'py>,
+    alloc: &PyAlloc,
+    has_tensor_engine: Option<bool>,
+) -> PyResult<Bound<'py, PyAny>> {
     let alloc = match alloc.take() {
         Some(alloc) => alloc,
         None => {
@@ -132,11 +139,15 @@ fn allocate_proc_mesh<'py>(py: Python<'py>, alloc: &PyAlloc) -> PyResult<Bound<'
         let mesh = ProcMesh::allocate(alloc)
             .await
             .map_err(|err| PyException::new_err(err.to_string()))?;
-        Ok(PyProcMesh::monitored(mesh, world_id))
+        PyProcMesh::monitored(mesh, world_id, has_tensor_engine.unwrap_or(false)).await
     })
 }
 
-fn allocate_proc_mesh_blocking<'py>(py: Python<'py>, alloc: &PyAlloc) -> PyResult<PyProcMesh> {
+fn allocate_proc_mesh_blocking<'py>(
+    py: Python<'py>,
+    alloc: &PyAlloc,
+    has_tensor_engine: Option<bool>,
+) -> PyResult<PyProcMesh> {
     let alloc = match alloc.take() {
         Some(alloc) => alloc,
         None => {
@@ -150,14 +161,18 @@ fn allocate_proc_mesh_blocking<'py>(py: Python<'py>, alloc: &PyAlloc) -> PyResul
         let mesh = ProcMesh::allocate(alloc)
             .await
             .map_err(|err| PyException::new_err(err.to_string()))?;
-        Ok(PyProcMesh::monitored(mesh, world_id))
+        PyProcMesh::monitored(mesh, world_id, has_tensor_engine.unwrap_or(false)).await
     })?
 }
 
 impl PyProcMesh {
     /// Create a new [`PyProcMesh`] with a monitor that crashes the
     /// process on any proc failure.
-    fn monitored(mut proc_mesh: ProcMesh, world_id: WorldId) -> Self {
+    async fn monitored(
+        mut proc_mesh: ProcMesh,
+        world_id: WorldId,
+        has_tensor_engine: bool,
+    ) -> Result<Self, PyErr> {
         let (sender, abort_receiver) = mpsc::channel::<bool>(1);
         let proc_events = SharedCell::from(Mutex::new(proc_mesh.events().unwrap()));
         let monitor = tokio::spawn(Self::default_proc_mesh_monitor(
@@ -167,13 +182,34 @@ impl PyProcMesh {
             world_id,
             abort_receiver,
         ));
-        Self {
-            inner: SharedCell::from(TrackedProcMesh::from(proc_mesh)),
+
+        let tracked_proc_mesh = TrackedProcMesh::from(proc_mesh);
+
+        // Create optional RDMA manager
+        let rdma_manager = if has_tensor_engine && monarch_rdma::ibverbs_supported() {
+            // TODO - make this configurable
+            let config = IbverbsConfig::default();
+            tracing::debug!("rdma is enabled, using device {}", config.device);
+            let actor_mesh = tracked_proc_mesh
+                .spawn("rdma_manager", &config)
+                .await
+                .map_err(|err| PyException::new_err(err.to_string()))?;
+            Some(actor_mesh)
+        } else {
+            if has_tensor_engine {
+                tracing::info!("rdma is not enabled on this hardware");
+            }
+            None
+        };
+
+        Ok(Self {
+            inner: SharedCell::from(tracked_proc_mesh),
             keepalive: Keepalive::new(monitor),
             proc_events,
             stop_monitor_sender: sender,
             user_monitor_registered: AtomicBool::new(false),
-        }
+            rdma_manager,
+        })
     }
 
     /// The default monitor of the proc mesh for crashes. If a proc crashes, we print the reason
@@ -224,21 +260,25 @@ impl PyProcMesh {
 #[pymethods]
 impl PyProcMesh {
     /// Python classmethod that delegates to `allocate_proc_mesh`.
     ///
     /// `has_tensor_engine` defaults to `false` on the Python side and is
     /// forwarded as `Some(..)`. When it is true, `PyProcMesh::monitored`
     /// also spawns an RDMA manager actor mesh if
     /// `monarch_rdma::ibverbs_supported()` reports support.
     #[classmethod]
+    #[pyo3(signature = (alloc, has_tensor_engine=false))]
     fn allocate_nonblocking<'py>(
         _cls: &Bound<'_, PyType>,
         py: Python<'py>,
         alloc: &PyAlloc,
+        has_tensor_engine: bool,
     ) -> PyResult<Bound<'py, PyAny>> {
-        allocate_proc_mesh(py, alloc)
+        allocate_proc_mesh(py, alloc, Some(has_tensor_engine))
     }
 
     /// Python classmethod that delegates to `allocate_proc_mesh_blocking`
     /// and returns the resulting `PyProcMesh` directly.
     ///
     /// `has_tensor_engine` defaults to `false` on the Python side and is
     /// forwarded as `Some(..)`. When it is true, `PyProcMesh::monitored`
     /// also spawns an RDMA manager actor mesh if
     /// `monarch_rdma::ibverbs_supported()` reports support.
     #[classmethod]
+    #[pyo3(signature = (alloc, has_tensor_engine=false))]
     fn allocate_blocking<'py>(
         _cls: &Bound<'_, PyType>,
         py: Python<'py>,
         alloc: &PyAlloc,
+        has_tensor_engine: bool,
     ) -> PyResult<PyProcMesh> {
-        allocate_proc_mesh_blocking(py, alloc)
+        allocate_proc_mesh_blocking(py, alloc, Some(has_tensor_engine))
     }
 
     fn spawn_nonblocking<'py>(
 
@@ -0,0 +1,20 @@
+# @generated by autocargo from //monarch/monarch_rdma/extension:monarch_rdma_extension
+
+[package]
+name = "monarch_rdma_extension"
+version = "0.0.0"
+authors = ["Meta"]
+edition = "2021"
+license = "BSD-3-Clause"
+
+[lib]
+path = "lib.rs"
+
+[dependencies]
+hyperactor = { version = "0.0.0", path = "../../hyperactor" }
+monarch_hyperactor = { version = "0.0.0", path = "../../monarch_hyperactor" }
+monarch_rdma = { version = "0.0.0", path = ".." }
+pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods"] }
+pyo3-async-runtimes = { version = "0.24", features = ["attributes", "tokio-runtime"] }
+serde = { version = "1.0.185", features = ["derive", "rc"] }
+serde_json = { version = "1.0.140", features = ["alloc", "float_roundtrip", "unbounded_depth"] }
Original file line number	Diff line number	Diff line change
`@@ -62,7 +62,7 @@ impl PythonActorMesh {`
`62`	`62`	`}`
`63`	`63`
`64`	`64`	`#[getter]`
`65`		`- fn client(&self) -> PyMailbox {`
	`65`	`+ pub fn client(&self) -> PyMailbox {`
`66`	`66`	`self.client.clone()`
`67`	`67`	`}`
`68`	`68`