From a345f6db023ac70d1618f4cb8ac74512316549f5 Mon Sep 17 00:00:00 2001 From: zdevito Date: Mon, 7 Jul 2025 17:19:27 -0700 Subject: [PATCH 1/3] [11/n] tensor engine, fix bugs to make test_remote_functions pass Fixes: * formatting of error messages * transposition of all_gathers, via a workaround. the real fix relies on fixing nccl-comm actor to respect the order of dimensions in process groups. * cloning of WireValue, which drops the is_wrapped_number property incorrectly. Differential Revision: [D77880706](https://our.internmc.facebook.com/intern/diff/D77880706/) [ghstack-poisoned] --- Cargo.toml | 3 --- monarch_tensor_worker/src/stream.rs | 2 +- python/monarch/mesh_controller.py | 4 +++- python/monarch/proc_mesh.py | 10 ++++++++-- python/tests/test_remote_functions.py | 9 +++++++-- torch-sys/Cargo.toml | 4 +--- torch-sys/src/ivalue.rs | 3 ++- 7 files changed, 22 insertions(+), 13 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9c623529..93aaa71c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,3 @@ members = [ "nccl-sys", "torch-sys", ] - -[profile.release] -incremental = true diff --git a/monarch_tensor_worker/src/stream.rs b/monarch_tensor_worker/src/stream.rs index c477b513..770514dd 100644 --- a/monarch_tensor_worker/src/stream.rs +++ b/monarch_tensor_worker/src/stream.rs @@ -984,7 +984,7 @@ impl StreamActor { } let err = err.unwrap_dependent_error().unwrap_or(err); WorkerError { - backtrace: format!("{:?}", err), + backtrace: err.to_string(), worker_actor_id: worker_actor_id.clone(), } }) diff --git a/python/monarch/mesh_controller.py b/python/monarch/mesh_controller.py index a21f3b76..b9dfed97 100644 --- a/python/monarch/mesh_controller.py +++ b/python/monarch/mesh_controller.py @@ -124,6 +124,8 @@ def _initialize_env(worker_point: Point, proc_id: str) -> None: } os.environ.update(process_env) pdb.set_trace = _set_trace + # workaround for set_manual_seed somehow not working if cuda is not initialized + torch.cuda.init() except Exception: 
traceback.print_exc() raise @@ -248,7 +250,7 @@ def __str__(self): return ( f"A remote function has failed asynchronously on rank {self.rank}.\n" f"Traceback of where the remote function was issued on controller (most recent call last):\n{controller_tb}" - f"Error as reported from worker!!!!!!!:\n{self.worker_error_string}" + f"Error as reported from worker:\n{self.worker_error_string}" ) except Exception: traceback.print_exc() diff --git a/python/monarch/proc_mesh.py b/python/monarch/proc_mesh.py index c947eb25..473e8758 100644 --- a/python/monarch/proc_mesh.py +++ b/python/monarch/proc_mesh.py @@ -299,7 +299,10 @@ async def proc_mesh_nonblocking( ) -> ProcMesh: if gpus is None: gpus = _local_device_count() - spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts) + # gpus must come last in this order, otherwise test_remote_function_all_gather fails, + # because test_remote_function_all_gather expects that hosts comes before gpus + # in the order of the dimensions. + spec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus) env = env or {} cmd, args, base_env = _get_bootstrap_args() env.update(base_env) @@ -313,7 +316,10 @@ def proc_mesh_blocking( ) -> ProcMesh: if gpus is None: gpus = _local_device_count() - spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts) + # gpus must come last in this order, otherwise test_remote_function_all_gather fails, + # because test_remote_function_all_gather expects that hosts comes before gpus + # in the order of the dimensions. + spec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus) env = env or {} cmd, args, base_env = _get_bootstrap_args() env.update(base_env) diff --git a/python/tests/test_remote_functions.py b/python/tests/test_remote_functions.py index 7b1b3398..d085c45a 100644 --- a/python/tests/test_remote_functions.py +++ b/python/tests/test_remote_functions.py @@ -185,7 +185,9 @@ def local_device_mesh( # out is not counted as a failure, so we set a more restrictive timeout to # ensure we see a hard failure in CI.
@pytest.mark.timeout(120) -@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS]) +@pytest.mark.parametrize( + "backend_type", [BackendType.PY, BackendType.RS, BackendType.MESH] +) class TestRemoteFunctions(RemoteFunctionsTestBase): @classmethod def do_test_reduce_scatter_tensor(cls, backend_type, reduce_op, expected_tensor): @@ -952,10 +954,13 @@ def test_remote_function_failure_message_contains_traceback(self, backend_type): x = outer_remote_function_that_calls_inner() try: inspect(x) - except RemoteException as e: + except OldRemoteException as e: backtrace = "\n".join([frame.name for frame in e.worker_frames]) assert "outer_remote_function" in backtrace assert "inner_remote_function" in backtrace + except NewRemoteException as e: + assert "outer_remote_function" in e.worker_error_string + assert "inner_remote_function" in e.worker_error_string def test_remote_function_broadcast(self, backend_type): with self.local_device_mesh(2, 2, backend_type) as device_mesh: diff --git a/torch-sys/Cargo.toml b/torch-sys/Cargo.toml index aee7d187..f8529719 100644 --- a/torch-sys/Cargo.toml +++ b/torch-sys/Cargo.toml @@ -12,6 +12,7 @@ links = "torch" anyhow = "1.0.98" async-trait = "0.1.86" atomic_refcell = "0.1.13" +bincode = "1.3.3" cxx = "1.0.119" derive_more = { version = "1.0.0", features = ["full"] } monarch_types = { version = "0.0.0", path = "../monarch_types" } @@ -24,9 +25,6 @@ thiserror = "2.0.12" tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } tracing = { version = "0.1.41", features = ["attributes", "valuable"] } -[dev-dependencies] -bincode = "1.3.3" - [build-dependencies] bindgen = "0.70.1" cxx-build = "1.0.119" diff --git a/torch-sys/src/ivalue.rs b/torch-sys/src/ivalue.rs index 592d58cd..34e2217f 100644 --- a/torch-sys/src/ivalue.rs +++ b/torch-sys/src/ivalue.rs @@ -150,7 +150,8 @@ impl Clone for OpaqueIValue { /// This creates a deep copy of the underlying data and can be expensive. 
/// It might also panic if the `IValue` is not cloneable. fn clone(&self) -> Self { - Self(ffi::ivalue_deepcopy(&self.0).unwrap()) + let serialized = bincode::serialize(&self.0).unwrap(); + bincode::deserialize(&serialized).unwrap() } } From c196932e26bc3ad502887bab16c380be5a02f41a Mon Sep 17 00:00:00 2001 From: zdevito Date: Mon, 7 Jul 2025 17:20:46 -0700 Subject: [PATCH 2/3] Update on "[11/n] tensor engine, fix bugs to make test_remote_functions pass" Fixes: * formatting of error messages * transposition of all_gathers, via a workaround. the real fix relies on fixing nccl-comm actor to respect the order of dimensions in process groups. * cloning of WireValue, which drops the is_wrapped_number property incorrectly. Differential Revision: [D77880706](https://our.internmc.facebook.com/intern/diff/D77880706/) [ghstack-poisoned] --- python/monarch/mesh_controller.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/monarch/mesh_controller.py b/python/monarch/mesh_controller.py index b9dfed97..f985d5ac 100644 --- a/python/monarch/mesh_controller.py +++ b/python/monarch/mesh_controller.py @@ -124,8 +124,9 @@ def _initialize_env(worker_point: Point, proc_id: str) -> None: } os.environ.update(process_env) pdb.set_trace = _set_trace - # workaround for set_manual_seed somehow not working if cuda is not initialized - torch.cuda.init() + # workaround for set_manual_seed somehow not working if cuda is not initialized + if torch.cuda.is_available(): + torch.cuda.init() except Exception: traceback.print_exc() raise From fd2d0d05115e9afafb6ace538b596a5139e2cd34 Mon Sep 17 00:00:00 2001 From: zdevito Date: Tue, 8 Jul 2025 07:33:06 -0700 Subject: [PATCH 3/3] Update on "[11/n] tensor engine, fix bugs to make test_remote_functions pass" Fixes: * formatting of error messages * transposition of all_gathers, via a workaround. the real fix relies on fixing nccl-comm actor to respect the order of dimensions in process groups.
* cloning of WireValue, which drops the is_wrapped_number property incorrectly. Differential Revision: [D77880706](https://our.internmc.facebook.com/intern/diff/D77880706/) [ghstack-poisoned] --- python/tests/test_python_actors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_python_actors.py b/python/tests/test_python_actors.py index b0124822..555b3a7e 100644 --- a/python/tests/test_python_actors.py +++ b/python/tests/test_python_actors.py @@ -492,7 +492,7 @@ def _patch_output(msg): rank, coords, _, _, function, lineno = breakpoints[i] initial_linenos[rank] = lineno assert rank == i - assert coords == {"hosts": rank % 2, "gpus": rank // 2} + assert coords == {"hosts": rank // 2, "gpus": rank % 2} assert function == "test_python_actors._debugee_actor_internal" assert lineno == breakpoints[0][5] + 4 * rank