
Commit b214594

zdevito authored and facebook-github-bot committed
tensor engine, fix bugs to make test_remote_functions pass (#469)
Summary:
Fixes:
* formatting of error messages
* transposition of all_gathers, via a workaround; the real fix relies on fixing the nccl-comm actor to respect the order of dimensions in process groups
* cloning of WireValue, which incorrectly drops the is_wrapped_number property

ghstack-source-id: 295007765
exported-using-ghexport

Reviewed By: colin2328, suo

Differential Revision: D77880706
1 parent 27dab00 commit b214594
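
The all_gather bullet above comes down to how a flat rank is mapped to mesh coordinates. A minimal sketch (plain Python, not the Monarch API) of a row-major rank-to-coordinate mapping shows why swapping the dimension order transposes which ranks a collective groups together:

def coords(rank, dims):
    # Row-major mapping from a flat rank to per-dimension coordinates;
    # the last dimension in `dims` varies fastest.
    out = {}
    for name, size in reversed(dims):
        out[name] = rank % size
        rank //= size
    return out

for rank in range(4):
    print(rank,
          coords(rank, [("hosts", 2), ("gpus", 2)]),   # hosts first (fixed order)
          coords(rank, [("gpus", 2), ("hosts", 2)]))   # gpus first (buggy order)
# Rank 1 is (hosts=0, gpus=1) under the fixed order but (hosts=1, gpus=0)
# under the buggy one: the layout is transposed, which is the all_gather
# transposition this commit works around.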

File tree

8 files changed (+24, −14 lines)


Cargo.toml

Lines changed: 0 additions & 3 deletions

@@ -14,6 +14,3 @@ members = [
     "nccl-sys",
     "torch-sys",
 ]
-
-[profile.release]
-incremental = true

monarch_tensor_worker/src/stream.rs

Lines changed: 1 addition & 1 deletion

@@ -983,7 +983,7 @@ impl StreamActor {
                 }
                 let err = err.unwrap_dependent_error().unwrap_or(err);
                 WorkerError {
-                    backtrace: format!("{:?}", err),
+                    backtrace: err.to_string(),
                     worker_actor_id: worker_actor_id.clone(),
                 }
             })
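
This is the "formatting of error messages" fix: Rust's "{:?}" is the Debug representation, while to_string() goes through Display, so the worker now reports a readable message instead of a struct dump. A rough Python analogue of the same distinction, using repr() versus str():

err = ValueError("size mismatch in all_gather")
print(repr(err))  # ValueError('size mismatch in all_gather')  -- like "{:?}" (Debug)
print(str(err))   # size mismatch in all_gather                -- like to_string() (Display)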

python/monarch/_src/actor/proc_mesh.py

Lines changed: 8 additions & 2 deletions

@@ -336,7 +336,10 @@ async def proc_mesh_nonblocking(
 ) -> ProcMesh:
     if gpus is None:
         gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+    # gpus must come last in this order, otherwise test_remote_function_all_gather
+    # fails, because test_remote_function_all_gather expects that hosts comes
+    # before gpus in the order of the dimensions.
+    spec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
     env = env or {}
     cmd, args, base_env = _get_bootstrap_args()
     env.update(base_env)

@@ -350,7 +353,10 @@ def proc_mesh_blocking(
 ) -> ProcMesh:
     if gpus is None:
         gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+    # gpus must come last in this order, otherwise test_remote_function_all_gather
+    # fails, because test_remote_function_all_gather expects that hosts comes
+    # before gpus in the order of the dimensions.
+    spec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
     env = env or {}
     cmd, args, base_env = _get_bootstrap_args()
     env.update(base_env)
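
Reordering the keyword arguments only helps if AllocSpec derives the mesh dimension order from the order of its keyword arguments; that is an assumption here, not something the diff states, but it is consistent with CPython preserving **kwargs insertion order (Python 3.7+). A hypothetical stand-in illustrates the mechanism:

def alloc_spec_dims(constraints, **dims):
    # Hypothetical stand-in for AllocSpec: dimension order == kwarg order.
    return list(dims.items())

print(alloc_spec_dims(None, gpus=2, hosts=2))  # [('gpus', 2), ('hosts', 2)] -- old call
print(alloc_spec_dims(None, hosts=2, gpus=2))  # [('hosts', 2), ('gpus', 2)] -- new call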

python/monarch/mesh_controller.py

Lines changed: 4 additions & 1 deletion

@@ -124,6 +124,9 @@ def _initialize_env(worker_point: Point, proc_id: str) -> None:
         }
         os.environ.update(process_env)
         pdb.set_trace = _set_trace
+        # workaround for set_manual_seed somehow not working if cuda is not initialized
+        if torch.cuda.is_available():
+            torch.cuda.init()
     except Exception:
         traceback.print_exc()
         raise

@@ -248,7 +251,7 @@ def __str__(self):
         return (
             f"A remote function has failed asynchronously on rank {self.rank}.\n"
             f"Traceback of where the remote function was issued on controller (most recent call last):\n{controller_tb}"
-            f"Error as reported from worker!!!!!!!:\n{self.worker_error_string}"
+            f"Error as reported from worker:\n{self.worker_error_string}"
         )
     except Exception:
         traceback.print_exc()
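
The first hunk works around manual seeding having no effect when the CUDA context does not exist yet: it eagerly creates the context so later seeding also reaches the CUDA RNG state. A minimal standalone sketch of the same guard:

import torch

# Initialize CUDA up front (only when present) so a later manual_seed call
# also seeds the CUDA generators; torch.cuda.init() is a no-op if the CUDA
# state is already initialized.
if torch.cuda.is_available():
    torch.cuda.init()
torch.manual_seed(0)  # seeds CPU and, now that it exists, CUDA RNG state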

python/tests/test_python_actors.py

Lines changed: 1 addition & 1 deletion

@@ -491,7 +491,7 @@ def _patch_output(msg):
         rank, coords, _, _, function, lineno = breakpoints[i]
         initial_linenos[rank] = lineno
         assert rank == i
-        assert coords == {"hosts": rank % 2, "gpus": rank // 2}
+        assert coords == {"hosts": rank // 2, "gpus": rank % 2}
         assert function == "test_python_actors._debugee_actor_internal"
         assert lineno == breakpoints[0][5] + 4 * rank
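
The flipped assertion follows from the hosts-first dimension order: with 2 hosts × 2 GPUs laid out row-major, hosts is the slow (outer) index and gpus the fast (inner) one:

for rank in range(4):
    print(rank, {"hosts": rank // 2, "gpus": rank % 2})
# 0 {'hosts': 0, 'gpus': 0}
# 1 {'hosts': 0, 'gpus': 1}
# 2 {'hosts': 1, 'gpus': 0}
# 3 {'hosts': 1, 'gpus': 1}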

python/tests/test_remote_functions.py

Lines changed: 7 additions & 2 deletions

@@ -185,7 +185,9 @@ def local_device_mesh(
 # out is not counted as a failure, so we set a more restrictive timeout to
 # ensure we see a hard failure in CI.
 @pytest.mark.timeout(120)
-@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS])
+@pytest.mark.parametrize(
+    "backend_type", [BackendType.PY, BackendType.RS, BackendType.MESH]
+)
 class TestRemoteFunctions(RemoteFunctionsTestBase):
     @classmethod
     def do_test_reduce_scatter_tensor(cls, backend_type, reduce_op, expected_tensor):

@@ -952,10 +954,13 @@ def test_remote_function_failure_message_contains_traceback(self, backend_type):
         x = outer_remote_function_that_calls_inner()
         try:
             inspect(x)
-        except RemoteException as e:
+        except OldRemoteException as e:
             backtrace = "\n".join([frame.name for frame in e.worker_frames])
             assert "outer_remote_function" in backtrace
             assert "inner_remote_function" in backtrace
+        except NewRemoteException as e:
+            assert "outer_remote_function" in e.worker_error_string
+            assert "inner_remote_function" in e.worker_error_string

     def test_remote_function_broadcast(self, backend_type):
         with self.local_device_mesh(2, 2, backend_type) as device_mesh:
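
With BackendType.MESH added to the parametrization, the failure test must accept either exception shape: the legacy exception carries structured worker_frames, while the mesh-controller exception carries a preformatted worker_error_string. A self-contained sketch of that dual-except pattern, with stand-in exception classes (the real OldRemoteException / NewRemoteException are presumably import aliases from two different monarch modules, which this diff does not show):

class OldRemoteException(Exception):   # legacy backends: structured frames
    def __init__(self, worker_frames):
        self.worker_frames = worker_frames

class NewRemoteException(Exception):   # MESH backend: preformatted string
    def __init__(self, worker_error_string):
        self.worker_error_string = worker_error_string

def assert_mentions_inner(exc):
    try:
        raise exc
    except OldRemoteException as e:
        assert any("inner_remote_function" in f for f in e.worker_frames)
    except NewRemoteException as e:
        assert "inner_remote_function" in e.worker_error_string

assert_mentions_inner(NewRemoteException("error in inner_remote_function"))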

torch-sys/Cargo.toml

Lines changed: 1 addition & 3 deletions

@@ -12,6 +12,7 @@ links = "torch"
 anyhow = "1.0.98"
 async-trait = "0.1.86"
 atomic_refcell = "0.1.13"
+bincode = "1.3.3"
 cxx = "1.0.119"
 derive_more = { version = "1.0.0", features = ["full"] }
 monarch_types = { version = "0.0.0", path = "../monarch_types" }

@@ -24,9 +25,6 @@ thiserror = "2.0.12"
 tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] }
 tracing = { version = "0.1.41", features = ["attributes", "valuable"] }

-[dev-dependencies]
-bincode = "1.3.3"
-
 [build-dependencies]
 bindgen = "0.70.1"
 cxx-build = "1.0.119"

torch-sys/src/ivalue.rs

Lines changed: 2 additions & 1 deletion

@@ -150,7 +150,8 @@ impl Clone for OpaqueIValue {
     /// This creates a deep copy of the underlying data and can be expensive.
     /// It might also panic if the `IValue` is not cloneable.
     fn clone(&self) -> Self {
-        Self(ffi::ivalue_deepcopy(&self.0).unwrap())
+        let serialized = bincode::serialize(&self.0).unwrap();
+        bincode::deserialize(&serialized).unwrap()
     }
 }
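
The new Clone round-trips the value through bincode instead of calling ffi::ivalue_deepcopy, because (per the commit summary) the deep copy dropped the is_wrapped_number property, and this is also why bincode moved out of [dev-dependencies] above. A Python analogue of clone-by-serialization, using pickle and a stand-in class to show that serialized properties survive the round-trip:

import pickle

class WireValueLike:
    # Stand-in for WireValue: a payload plus the flag the old clone dropped.
    def __init__(self, value, is_wrapped_number):
        self.value = value
        self.is_wrapped_number = is_wrapped_number

v = WireValueLike(3, is_wrapped_number=True)
clone = pickle.loads(pickle.dumps(v))  # ~ bincode deserialize(serialize(v))
assert clone.is_wrapped_number         # the flag survives the round-trip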
