Skip to content

Commit 6abab2a

Browse files
committed
ulfm/comm_mark_failed: fix a deadlock caused by checking a sentinel proc for liveness. When marking a proc as failed in a comm, we need to load the proc into the comm's group array so that the sentinel is not checked for liveness later.
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
1 parent 548b488 commit 6abab2a

File tree

2 files changed

+17
-0
lines changed

2 files changed

+17
-0
lines changed

ompi/communicator/ft/comm_ft.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,11 +334,27 @@ bool ompi_comm_is_proc_active(ompi_communicator_t *comm, int peer_id, bool remot
334334
/* If the proc is not known yet (get_proc_ptr returns NULL for a valid
335335
* peer_id), then we assume that the proc is alive. When it is dead, the
336336
* proc will exist. */
337+
#if OPAL_ENABLE_DEBUG
338+
if(NULL == ompi_proc) {
339+
/* this debug has side effects on behavior/performance: it loads up the
340+
* proc for every query and may end up being equivalent to the 'preconnect
341+
* all' option in the worst case. */
342+
ompi_proc = ompi_group_get_proc_ptr((remote ? comm->c_remote_group : comm->c_local_group),
343+
peer_id, true);
344+
assert(NULL != ompi_proc);
345+
assert(ompi_proc_is_active(ompi_proc));
346+
}
347+
#endif
337348
return (NULL == ompi_proc) ? true : ompi_proc_is_active(ompi_proc);
338349
}
339350

340351
int ompi_comm_set_rank_failed(ompi_communicator_t *comm, int peer_id, bool remote)
341352
{
353+
/* populate the proc in the comm's group array so that it is not a sentinel and can be read as failed */
354+
ompi_proc_t *ompi_proc = ompi_group_get_proc_ptr((remote ? comm->c_remote_group : comm->c_local_group),
355+
peer_id, true);
356+
assert(NULL != ompi_proc);
357+
342358
/* Disable ANY_SOURCE */
343359
comm->any_source_enabled = false;
344360
opal_atomic_wmb(); /* non-locked update needs a memory barrier to propagate */

ompi/proc/proc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,7 @@ static inline void ompi_proc_mark_as_failed(ompi_proc_t *proc) {
471471
abort();
472472
}
473473
proc->proc_active = false;
474+
opal_atomic_wmb(); /* non-locked update needs a memory barrier to propagate */
474475
}
475476
#endif /* OPAL_ENABLE_FT_MPI */
476477

0 commit comments

Comments
 (0)