Skip to content

Commit bdb6ece

Browse files
authored
Merge pull request #5452 from hoopoepg/topic/osc-ucx-fox-hang
OSC/UCX: fixed hang on OSC init
2 parents 2981249 + fa33e32 commit bdb6ece

File tree

2 files changed

+40
-23
lines changed

2 files changed

+40
-23
lines changed

ompi/mca/osc/ucx/osc_ucx.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ typedef struct ompi_osc_ucx_component {
3838
opal_free_list_t requests; /* request free list for the r* communication variants */
3939
bool env_initialized; /* UCX environment is initialized or not */
4040
int num_incomplete_req_ops;
41+
int num_modules;
4142
unsigned int priority;
4243
} ompi_osc_ucx_component_t;
4344

ompi/mca/osc/ucx/osc_ucx_component.c

Lines changed: 39 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ static int component_query(struct ompi_win_t *win, void **base, size_t size, int
2626
static int component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit,
2727
struct ompi_communicator_t *comm, struct opal_info_t *info,
2828
int flavor, int *model);
29+
static void ompi_osc_ucx_unregister_progress(void);
2930

3031
ompi_osc_ucx_component_t mca_osc_ucx_component = {
3132
{ /* ompi_osc_base_component_t */
@@ -45,7 +46,12 @@ ompi_osc_ucx_component_t mca_osc_ucx_component = {
4546
.osc_query = component_query,
4647
.osc_select = component_select,
4748
.osc_finalize = component_finalize,
48-
}
49+
},
50+
.ucp_context = NULL,
51+
.ucp_worker = NULL,
52+
.env_initialized = false,
53+
.num_incomplete_req_ops = 0,
54+
.num_modules = 0
4955
};
5056

5157
ompi_osc_ucx_module_t ompi_osc_ucx_module_template = {
@@ -105,24 +111,15 @@ static int component_register(void) {
105111
}
106112

107113
static int progress_callback(void) {
108-
if (mca_osc_ucx_component.ucp_worker != NULL &&
109-
mca_osc_ucx_component.num_incomplete_req_ops > 0) {
110-
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
111-
}
114+
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
112115
return 0;
113116
}
114117

115118
static int component_init(bool enable_progress_threads, bool enable_mpi_threads) {
116-
int ret = OMPI_SUCCESS;
117-
118-
mca_osc_ucx_component.ucp_context = NULL;
119-
mca_osc_ucx_component.ucp_worker = NULL;
120119
mca_osc_ucx_component.enable_mpi_threads = enable_mpi_threads;
121-
mca_osc_ucx_component.env_initialized = false;
122-
mca_osc_ucx_component.num_incomplete_req_ops = 0;
123120

124121
opal_common_ucx_mca_register();
125-
return ret;
122+
return OMPI_SUCCESS;
126123
}
127124

128125
static int component_finalize(void) {
@@ -141,7 +138,6 @@ static int component_finalize(void) {
141138
assert(mca_osc_ucx_component.num_incomplete_req_ops == 0);
142139
if (mca_osc_ucx_component.env_initialized == true) {
143140
OBJ_DESTRUCT(&mca_osc_ucx_component.requests);
144-
opal_progress_unregister(progress_callback);
145141
ucp_cleanup(mca_osc_ucx_component.ucp_context);
146142
mca_osc_ucx_component.env_initialized = false;
147143
}
@@ -241,6 +237,20 @@ static inline int mem_map(void **base, size_t size, ucp_mem_h *memh_ptr,
241237
return ret;
242238
}
243239

240+
static void ompi_osc_ucx_unregister_progress()
241+
{
242+
int ret;
243+
244+
mca_osc_ucx_component.num_modules--;
245+
OSC_UCX_ASSERT(mca_osc_ucx_component.num_modules >= 0);
246+
if (0 == mca_osc_ucx_component.num_modules) {
247+
ret = opal_progress_unregister(progress_callback);
248+
if (OMPI_SUCCESS != ret) {
249+
OSC_UCX_VERBOSE(1, "opal_progress_unregister failed: %d", ret);
250+
}
251+
}
252+
}
253+
244254
static int component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit,
245255
struct ompi_communicator_t *comm, struct opal_info_t *info,
246256
int flavor, int *model) {
@@ -251,7 +261,7 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
251261
ucs_status_t status;
252262
int i, comm_size = ompi_comm_size(comm);
253263
int is_eps_ready;
254-
bool progress_registered = false, eps_created = false, env_initialized = false;
264+
bool eps_created = false, env_initialized = false;
255265
ucp_address_t *my_addr = NULL;
256266
size_t my_addr_len;
257267
char *recv_buf = NULL;
@@ -328,13 +338,6 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
328338
goto error_nomem;
329339
}
330340

331-
ret = opal_progress_register(progress_callback);
332-
progress_registered = true;
333-
if (OMPI_SUCCESS != ret) {
334-
OSC_UCX_VERBOSE(1, "opal_progress_register failed: %d", ret);
335-
goto error;
336-
}
337-
338341
/* query UCP worker attributes */
339342
worker_attr.field_mask = UCP_WORKER_ATTR_FIELD_THREAD_MODE;
340343
status = ucp_worker_query(mca_osc_ucx_component.ucp_worker, &worker_attr);
@@ -362,6 +365,8 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
362365
goto error_nomem;
363366
}
364367

368+
mca_osc_ucx_component.num_modules++;
369+
365370
/* fill in the function pointer part */
366371
memcpy(module, &ompi_osc_ucx_module_template, sizeof(ompi_osc_base_module_t));
367372

@@ -616,6 +621,14 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
616621
goto error;
617622
}
618623

624+
OSC_UCX_ASSERT(mca_osc_ucx_component.num_modules > 0);
625+
if (1 == mca_osc_ucx_component.num_modules) {
626+
ret = opal_progress_register(progress_callback);
627+
if (OMPI_SUCCESS != ret) {
628+
OSC_UCX_VERBOSE(1, "opal_progress_register failed: %d", ret);
629+
goto error;
630+
}
631+
}
619632
return ret;
620633

621634
error:
@@ -643,8 +656,10 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
643656
ucp_ep_destroy(ep);
644657
}
645658
}
646-
if (progress_registered) opal_progress_unregister(progress_callback);
647-
if (module) free(module);
659+
if (module) {
660+
free(module);
661+
ompi_osc_ucx_unregister_progress();
662+
}
648663

649664
error_nomem:
650665
if (env_initialized == true) {
@@ -812,6 +827,7 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) {
812827
ompi_comm_free(&module->comm);
813828

814829
free(module);
830+
ompi_osc_ucx_unregister_progress();
815831

816832
return ret;
817833
}

0 commit comments

Comments
 (0)