Skip to content

Commit cd9f3bc

Browse files
moonli and facebook-github-bot
authored and committed
mitigate the python proc mesh bug for KD controller service (#512)
Summary: Pull Request resolved: #512. A bug introduced by D77250211 broke the lifecycle management of the Python proc mesh: the Alloc of the proc mesh can be destroyed earlier than expected, which causes connections to be dropped. This diff is a workaround; it keeps the proc meshes around, the same as the actor mesh. Differential Revision: D78111736 fbshipit-source-id: 51c943ebf52824aa499a0d1384d45f755cc9dcea
1 parent 814d48d commit cd9f3bc

File tree

1 file changed

+2
-0
lines changed

1 file changed

+2
-0
lines changed

hyperactor_mesh/src/proc_mesh.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ impl ProcMesh {
184184
let (router_channel_addr, router_rx) = channel::serve(ChannelAddr::any(alloc.transport()))
185185
.await
186186
.map_err(|err| AllocatorError::Other(err.into()))?;
187+
tracing::info!("router channel started listening on addr: {router_channel_addr}");
187188
let router = DialMailboxRouter::new_with_default(global_router().boxed());
188189
for (rank, (addr, _agent)) in running.iter().enumerate() {
189190
let proc_id = proc_ids.get(rank).unwrap().clone();
@@ -202,6 +203,7 @@ impl ProcMesh {
202203
let (client_proc_addr, client_rx) = channel::serve(ChannelAddr::any(alloc.transport()))
203204
.await
204205
.map_err(|err| AllocatorError::Other(err.into()))?;
206+
tracing::info!("client proc started listening on addr: {client_proc_addr}");
205207
let client_proc = Proc::new(
206208
client_proc_id.clone(),
207209
BoxedMailboxSender::new(router.clone()),

0 commit comments

Comments
 (0)