From d00b1a9c5fbf3eeab9ed58b8361824af74b68fab Mon Sep 17 00:00:00 2001 From: suo Date: Sat, 5 Jul 2025 21:08:47 -0700 Subject: [PATCH 1/2] [monarch] refactor monarch/ callsites to use monarch.actor as title Differential Revision: [D77823401](https://our.internmc.facebook.com/intern/diff/D77823401/) [ghstack-poisoned] --- examples/grpo_actor.py | 3 +-- examples/notebooks/ping_pong.ipynb | 10 +++++----- examples/notebooks/spmd_ddp.ipynb | 4 ++-- monarch_extension/src/mesh_controller.rs | 1 - python/monarch/_testing.py | 2 +- python/monarch/actor/__init__.py | 13 ++++++++++--- python/monarch/mesh_controller.py | 2 +- python/tests/error_test_binary.py | 3 +-- python/tests/test_actor_error.py | 3 +-- python/tests/test_allocator.py | 10 ++++++++-- python/tests/test_tensor_engine.py | 2 +- 11 files changed, 31 insertions(+), 22 deletions(-) diff --git a/examples/grpo_actor.py b/examples/grpo_actor.py index 5992f473..52c5dde1 100644 --- a/examples/grpo_actor.py +++ b/examples/grpo_actor.py @@ -13,9 +13,8 @@ import torch import torch.nn as nn import torch.optim as optim -from monarch.actor_mesh import Actor, ActorMeshRef, endpoint -from monarch.proc_mesh import proc_mesh +from monarch.actor import Actor, ActorMeshRef, endpoint, proc_mesh from monarch.rdma import RDMABuffer from torch.distributions import Categorical, kl_divergence diff --git a/examples/notebooks/ping_pong.ipynb b/examples/notebooks/ping_pong.ipynb index e6ae862d..6675587e 100644 --- a/examples/notebooks/ping_pong.ipynb +++ b/examples/notebooks/ping_pong.ipynb @@ -16,7 +16,7 @@ "metadata": {}, "source": [ "## Hello World\n", - "Actors are spawned in Process meshes via the `monarch.proc_mesh` API. For those familiar with distributed systems, it can be helpful to think of each Actor as a server with endpoints that can be called." + "Actors are spawned in Process meshes via the `monarch.actor` API. For those familiar with distributed systems, it can be helpful to think of each Actor as a server with endpoints that can be called." ] }, { @@ -43,8 +43,8 @@ "source": [ "import asyncio\n", "\n", - "from monarch.proc_mesh import proc_mesh, ProcMesh\n", - "from monarch.actor_mesh import Actor, endpoint, current_rank\n", + "from monarch.actor import proc_mesh, ProcMesh\n", + "from monarch.actor import Actor, endpoint, current_rank\n", "\n", "NUM_ACTORS=4\n", "\n", @@ -168,8 +168,8 @@ "source": [ "import asyncio\n", "\n", - "from monarch.proc_mesh import proc_mesh, ProcMesh\n", - "from monarch.actor_mesh import Actor, endpoint, current_rank\n", + "from monarch.actor import proc_mesh, ProcMesh\n", + "from monarch.actor import Actor, endpoint, current_rank\n", "\n", "class ExampleActor(Actor):\n", " def __init__(self, actor_name):\n", diff --git a/examples/notebooks/spmd_ddp.ipynb b/examples/notebooks/spmd_ddp.ipynb index c9e930dd..6036bb42 100644 --- a/examples/notebooks/spmd_ddp.ipynb +++ b/examples/notebooks/spmd_ddp.ipynb @@ -24,8 +24,8 @@ "import torch.nn as nn\n", "import torch.optim as optim\n", "\n", - "from monarch.proc_mesh import proc_mesh\n", - "from monarch.actor_mesh import Actor, current_rank, endpoint\n", + "from monarch.actor import proc_mesh\n", + "from monarch.actor import Actor, current_rank, endpoint\n", "\n", "from torch.nn.parallel import DistributedDataParallel as DDP\n", "\n", diff --git a/monarch_extension/src/mesh_controller.rs b/monarch_extension/src/mesh_controller.rs index 3cafa131..52b59d92 100644 --- a/monarch_extension/src/mesh_controller.rs +++ b/monarch_extension/src/mesh_controller.rs @@ -30,7 +30,6 @@ use hyperactor::PortRef; use hyperactor::cap::CanSend; use hyperactor::mailbox::MailboxSenderError; use hyperactor_mesh::Mesh; -use hyperactor_mesh::ProcMesh; use hyperactor_mesh::actor_mesh::RootActorMesh; use hyperactor_mesh::shared_cell::SharedCell; use hyperactor_mesh::shared_cell::SharedCellRef; diff --git a/python/monarch/_testing.py b/python/monarch/_testing.py index de83b34a..619e526a 100644 --- a/python/monarch/_testing.py +++ b/python/monarch/_testing.py @@ -14,12 +14,12 @@ import monarch_supervisor from monarch.actor._shape import NDSlice +from monarch.actor import proc_mesh, ProcMesh from monarch.common.client import Client from monarch.common.device_mesh import DeviceMesh from monarch.common.invocation import DeviceException, RemoteException from monarch.controller.backend import ProcessBackend from monarch.mesh_controller import spawn_tensor_engine -from monarch.proc_mesh import proc_mesh, ProcMesh from monarch.python_local_mesh import PythonLocalContext from monarch.rust_local_mesh import ( local_mesh, diff --git a/python/monarch/actor/__init__.py b/python/monarch/actor/__init__.py index 943f21d1..26bb2dd9 100644 --- a/python/monarch/actor/__init__.py +++ b/python/monarch/actor/__init__.py @@ -12,27 +12,34 @@ Accumulator, Actor, ActorError, + ActorMeshRef, current_actor_name, current_rank, current_size, endpoint, MonarchContext, + Point, + send, ValueMesh, ) from monarch.actor._future import Future -from monarch.actor._proc_mesh import proc_mesh, ProcMesh +from monarch.actor._proc_mesh import local_proc_mesh, proc_mesh, ProcMesh __all__ = [ "Accumulator", "Actor", "ActorError", + "ActorMeshRef", "current_actor_name", "current_rank", "current_size", "endpoint", + "Future", + "local_proc_mesh", "MonarchContext", - "ValueMesh", + "Point", "proc_mesh", "ProcMesh", - "Future", + "send", + "ValueMesh", ] diff --git a/python/monarch/mesh_controller.py b/python/monarch/mesh_controller.py index cfe123d8..959f6f99 100644 --- a/python/monarch/mesh_controller.py +++ b/python/monarch/mesh_controller.py @@ -34,7 +34,7 @@ ActorId, ) from monarch.actor._shape import NDSlice -from monarch.actor_mesh import Port, PortTuple +from monarch.actor._actor_mesh import Port, PortTuple from monarch.common import messages from monarch.common.controller_api import TController from monarch.common.invocation import Seq diff --git a/python/tests/error_test_binary.py b/python/tests/error_test_binary.py index a3a4ff78..524567d4 100644 --- a/python/tests/error_test_binary.py +++ b/python/tests/error_test_binary.py @@ -13,8 +13,7 @@ from monarch._rust_bindings.monarch_extension.panic import panicking_function -from monarch.actor_mesh import Actor, endpoint, send -from monarch.proc_mesh import proc_mesh +from monarch.actor import Actor, endpoint, proc_mesh, send class ErrorActor(Actor): diff --git a/python/tests/test_actor_error.py b/python/tests/test_actor_error.py index ed0b269a..b51a471b 100644 --- a/python/tests/test_actor_error.py +++ b/python/tests/test_actor_error.py @@ -10,8 +10,7 @@ import pytest from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcEvent -from monarch.actor_mesh import Actor, ActorError, endpoint, send -from monarch.proc_mesh import local_proc_mesh, proc_mesh +from monarch.actor import Actor, ActorError, endpoint, local_proc_mesh, proc_mesh class ExceptionActor(Actor): diff --git a/python/tests/test_allocator.py b/python/tests/test_allocator.py index e9d9bb9d..4623f56a 100644 --- a/python/tests/test_allocator.py +++ b/python/tests/test_allocator.py @@ -32,14 +32,20 @@ ChannelAddr, ChannelTransport, ) +from monarch.actor import ( + Actor, + current_rank, + current_size, + endpoint, + ProcMesh, + ValueMesh, +) from monarch.actor._allocator import ( ALLOC_LABEL_PROC_MESH_NAME, RemoteAllocator, StaticRemoteAllocInitializer, TorchXRemoteAllocInitializer, ) -from monarch.actor_mesh import Actor, current_rank, current_size, endpoint, ValueMesh -from monarch.proc_mesh import ProcMesh from monarch.tools.mesh_spec import MeshSpec, ServerSpec from monarch.tools.network import get_sockaddr diff --git a/python/tests/test_tensor_engine.py b/python/tests/test_tensor_engine.py index 6098c32c..a7e5122b 100644 --- a/python/tests/test_tensor_engine.py +++ b/python/tests/test_tensor_engine.py @@ -8,8 +8,8 @@ import pytest import torch from monarch import remote +from monarch.actor import proc_mesh from monarch.mesh_controller import spawn_tensor_engine -from monarch.proc_mesh import proc_mesh two_gpu = pytest.mark.skipif( From 2feaa15bdae6504ae444fa2ee54157459690dd69 Mon Sep 17 00:00:00 2001 From: suo Date: Sat, 5 Jul 2025 21:29:25 -0700 Subject: [PATCH 2/2] Update on "[monarch] refactor monarch/ callsites to use monarch.actor" as title Differential Revision: [D77823401](https://our.internmc.facebook.com/intern/diff/D77823401/) [ghstack-poisoned] --- python/monarch/_testing.py | 2 +- python/monarch/actor/_proc_mesh.py | 23 ++++++++++++++--------- python/monarch/mesh_controller.py | 2 +- setup.py | 2 +- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/python/monarch/_testing.py b/python/monarch/_testing.py index 619e526a..2925b8aa 100644 --- a/python/monarch/_testing.py +++ b/python/monarch/_testing.py @@ -13,8 +13,8 @@ from typing import Any, Callable, Dict, Generator, Literal, Optional import monarch_supervisor -from monarch.actor._shape import NDSlice from monarch.actor import proc_mesh, ProcMesh +from monarch.actor._shape import NDSlice from monarch.common.client import Client from monarch.common.device_mesh import DeviceMesh from monarch.common.invocation import DeviceException, RemoteException diff --git a/python/monarch/actor/_proc_mesh.py b/python/monarch/actor/_proc_mesh.py index 80c6aaaa..ccfd4799 100644 --- a/python/monarch/actor/_proc_mesh.py +++ b/python/monarch/actor/_proc_mesh.py @@ -22,8 +22,6 @@ TypeVar, ) -from monarch._rust_bindings import has_tensor_engine - from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension # @manual=//monarch/monarch_extension:monarch_extension Alloc, AllocConstraints, @@ -44,11 +42,21 @@ from monarch.actor._future import Future from monarch.actor._shape import MeshTrait +HAS_TENSOR_ENGINE = False +try: + # TODO: while the tensor_engine submodule doesn't exist yet, use the + # available of monarch.rdma as a proxy. + # type: ignore + from monarch.rdma import RDMAManager # @manual + + HAS_TENSOR_ENGINE = True +except ImportError: + pass + if TYPE_CHECKING: Tensor = Any DeviceMesh = Any - RDMAManager = Any T = TypeVar("T") @@ -83,10 +91,7 @@ def __init__( self._rsync_mesh_client: Optional[RsyncMeshClient] = None self._auto_reload_actor: Optional[AutoReloadActor] = None self._maybe_device_mesh: Optional["DeviceMesh"] = _device_mesh - if _mock_shape is None and has_tensor_engine(): - # type: ignore[21] - from monarch.rdma import RDMAManager # @manual - + if _mock_shape is None and HAS_TENSOR_ENGINE: # type: ignore[21] self._rdma_manager = self._spawn_blocking("rdma_manager", RDMAManager) @@ -191,7 +196,7 @@ async def _spawn_nonblocking( @property def _device_mesh(self) -> "DeviceMesh": - if not has_tensor_engine(): + if not HAS_TENSOR_ENGINE: raise RuntimeError( "DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)" ) @@ -280,7 +285,7 @@ def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[Pro ) -_BOOTSTRAP_MAIN = "monarch.bootstrap_main" +_BOOTSTRAP_MAIN = "monarch.actor._bootstrap_main" def _get_bootstrap_args() -> tuple[str, Optional[list[str]], dict[str, str]]: diff --git a/python/monarch/mesh_controller.py b/python/monarch/mesh_controller.py index 959f6f99..015a5e50 100644 --- a/python/monarch/mesh_controller.py +++ b/python/monarch/mesh_controller.py @@ -33,8 +33,8 @@ from monarch._rust_bindings.monarch_hyperactor.proc import ( # @manual=//monarch/monarch_extension:monarch_extension ActorId, ) -from monarch.actor._shape import NDSlice from monarch.actor._actor_mesh import Port, PortTuple +from monarch.actor._shape import NDSlice from monarch.common import messages from monarch.common.controller_api import TController from monarch.common.invocation import Seq diff --git a/setup.py b/setup.py index cd509910..35ccd6f4 100644 --- a/setup.py +++ b/setup.py @@ -166,7 +166,7 @@ def run(self): entry_points={ "console_scripts": [ "monarch=monarch.tools.cli:main", - "monarch_bootstrap=monarch.bootstrap_main:invoke_main", + "monarch_bootstrap=monarch.actor._bootstrap_main:invoke_main", ], }, rust_extensions=rust_extensions,