diff --git a/Cargo.toml b/Cargo.toml
index 9c623529..93aaa71c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,6 +14,3 @@ members = [
     "nccl-sys",
     "torch-sys",
 ]
-
-[profile.release]
-incremental = true
diff --git a/monarch_tensor_worker/src/stream.rs b/monarch_tensor_worker/src/stream.rs
index 1ee4e7a7..c8d409c3 100644
--- a/monarch_tensor_worker/src/stream.rs
+++ b/monarch_tensor_worker/src/stream.rs
@@ -984,7 +984,7 @@ impl StreamActor {
                 }
                 let err = err.unwrap_dependent_error().unwrap_or(err);
                 WorkerError {
-                    backtrace: format!("{:?}", err),
+                    backtrace: err.to_string(),
                     worker_actor_id: worker_actor_id.clone(),
                 }
             })
diff --git a/python/monarch/_src/actor/proc_mesh.py b/python/monarch/_src/actor/proc_mesh.py
index d0c3fc79..511c4da6 100644
--- a/python/monarch/_src/actor/proc_mesh.py
+++ b/python/monarch/_src/actor/proc_mesh.py
@@ -336,7 +336,10 @@ async def proc_mesh_nonblocking(
 ) -> ProcMesh:
     if gpus is None:
         gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+    # hosts must come before gpus here, otherwise test_remote_function_all_gather
+    # fails: it expects that hosts comes before gpus in the order of the
+    # dimensions.
+    spec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
     env = env or {}
     cmd, args, base_env = _get_bootstrap_args()
     env.update(base_env)
@@ -350,7 +353,10 @@ def proc_mesh_blocking(
 ) -> ProcMesh:
     if gpus is None:
         gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+    # hosts must come before gpus here, otherwise test_remote_function_all_gather
+    # fails: it expects that hosts comes before gpus in the order of the
+    # dimensions.
+    spec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
     env = env or {}
     cmd, args, base_env = _get_bootstrap_args()
     env.update(base_env)
diff --git a/python/monarch/mesh_controller.py b/python/monarch/mesh_controller.py
index 04225fe1..b36e37a9 100644
--- a/python/monarch/mesh_controller.py
+++ b/python/monarch/mesh_controller.py
@@ -124,6 +124,9 @@ def _initialize_env(worker_point: Point, proc_id: str) -> None:
         }
         os.environ.update(process_env)
         pdb.set_trace = _set_trace
+        # workaround for set_manual_seed somehow not working if cuda is not initialized
+        if torch.cuda.is_available():
+            torch.cuda.init()
     except Exception:
         traceback.print_exc()
         raise
@@ -248,7 +251,7 @@ def __str__(self):
             return (
                 f"A remote function has failed asynchronously on rank {self.rank}.\n"
                 f"Traceback of where the remote function was issued on controller (most recent call last):\n{controller_tb}"
-                f"Error as reported from worker!!!!!!!:\n{self.worker_error_string}"
+                f"Error as reported from worker:\n{self.worker_error_string}"
             )
         except Exception:
             traceback.print_exc()
diff --git a/python/tests/test_python_actors.py b/python/tests/test_python_actors.py
index db597bb9..7092844e 100644
--- a/python/tests/test_python_actors.py
+++ b/python/tests/test_python_actors.py
@@ -491,7 +491,7 @@ def _patch_output(msg):
         rank, coords, _, _, function, lineno = breakpoints[i]
         initial_linenos[rank] = lineno
         assert rank == i
-        assert coords == {"hosts": rank % 2, "gpus": rank // 2}
+        assert coords == {"hosts": rank // 2, "gpus": rank % 2}
         assert function == "test_python_actors._debugee_actor_internal"
         assert lineno == breakpoints[0][5] + 4 * rank
 
diff --git a/python/tests/test_remote_functions.py b/python/tests/test_remote_functions.py
index 7b1b3398..d085c45a 100644
--- a/python/tests/test_remote_functions.py
+++ b/python/tests/test_remote_functions.py
@@ -185,7 +185,9 @@ def local_device_mesh(
 # out is not counted as a failure, so we set a more restrictive timeout to
 # ensure we see a hard failure in CI.
 @pytest.mark.timeout(120)
-@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS])
+@pytest.mark.parametrize(
+    "backend_type", [BackendType.PY, BackendType.RS, BackendType.MESH]
+)
 class TestRemoteFunctions(RemoteFunctionsTestBase):
     @classmethod
     def do_test_reduce_scatter_tensor(cls, backend_type, reduce_op, expected_tensor):
@@ -952,10 +954,13 @@ def test_remote_function_failure_message_contains_traceback(self, backend_type):
             x = outer_remote_function_that_calls_inner()
             try:
                 inspect(x)
-            except RemoteException as e:
+            except OldRemoteException as e:
                 backtrace = "\n".join([frame.name for frame in e.worker_frames])
                 assert "outer_remote_function" in backtrace
                 assert "inner_remote_function" in backtrace
+            except NewRemoteException as e:
+                assert "outer_remote_function" in e.worker_error_string
+                assert "inner_remote_function" in e.worker_error_string
 
     def test_remote_function_broadcast(self, backend_type):
         with self.local_device_mesh(2, 2, backend_type) as device_mesh:
diff --git a/torch-sys/Cargo.toml b/torch-sys/Cargo.toml
index aee7d187..f8529719 100644
--- a/torch-sys/Cargo.toml
+++ b/torch-sys/Cargo.toml
@@ -12,6 +12,7 @@ links = "torch"
 anyhow = "1.0.98"
 async-trait = "0.1.86"
 atomic_refcell = "0.1.13"
+bincode = "1.3.3"
 cxx = "1.0.119"
 derive_more = { version = "1.0.0", features = ["full"] }
 monarch_types = { version = "0.0.0", path = "../monarch_types" }
@@ -24,9 +25,6 @@ thiserror = "2.0.12"
 tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] }
 tracing = { version = "0.1.41", features = ["attributes", "valuable"] }
 
-[dev-dependencies]
-bincode = "1.3.3"
-
 [build-dependencies]
 bindgen = "0.70.1"
 cxx-build = "1.0.119"
diff --git a/torch-sys/src/ivalue.rs b/torch-sys/src/ivalue.rs
index 592d58cd..34e2217f 100644
--- a/torch-sys/src/ivalue.rs
+++ b/torch-sys/src/ivalue.rs
@@ -150,7 +150,8 @@ impl Clone for OpaqueIValue {
     /// This creates a deep copy of the underlying data and can be expensive.
     /// It might also panic if the `IValue` is not cloneable.
     fn clone(&self) -> Self {
-        Self(ffi::ivalue_deepcopy(&self.0).unwrap())
+        let serialized = bincode::serialize(&self.0).unwrap();
+        bincode::deserialize(&serialized).unwrap()
     }
 }