Skip to content

Commit dc68094

Browse files
committed
osc_ucx: fix hang/timeout in component finalize
Add barrier to make sure all endpoints are destroyed before destroying the worker. Signed-off-by: Yossi Itigin <yosefe@mellanox.com>
1 parent 98ad78e commit dc68094

File tree

1 file changed

+10
-0
lines changed

1 file changed

+10
-0
lines changed

ompi/mca/osc/ucx/osc_ucx_component.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,14 @@ static int component_init(bool enable_progress_threads, bool enable_mpi_threads)
127127
return OMPI_SUCCESS;
128128
}
129129

130+
/*
 * Run a barrier across the world communicator (ompi_mpi_comm_world.comm).
 *
 * component_finalize() calls this right before ucp_worker_destroy(); per the
 * commit description, the barrier is there to make sure all endpoints are
 * destroyed before the worker is destroyed.
 *
 * progress_callback is registered with opal_progress only for the duration
 * of the barrier and unregistered again before returning.
 * NOTE(review): presumably this keeps the UCX worker progressing while the
 * process is blocked in the barrier -- confirm against progress_callback.
 *
 * The value returned by coll_barrier is discarded; this function is void and
 * has no way to report a failed barrier to its caller.
 */
static void component_world_barrier(void)
131+
{
132+
ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
133+
opal_progress_register(progress_callback);
134+
comm->c_coll->coll_barrier(comm, comm->c_coll->coll_barrier_module);
135+
opal_progress_unregister(progress_callback);
136+
}
137+
130138
static int component_finalize(void) {
131139
int i;
132140
for (i = 0; i < ompi_proc_world_size(); i++) {
@@ -136,7 +144,9 @@ static int component_finalize(void) {
136144
}
137145
}
138146

147+
assert(mca_osc_ucx_component.num_modules == 0);
139148
if (mca_osc_ucx_component.ucp_worker != NULL) {
149+
component_world_barrier();
140150
ucp_worker_destroy(mca_osc_ucx_component.ucp_worker);
141151
}
142152

0 commit comments

Comments
 (0)