Skip to content

Commit 0c1da0f

Browse files
authored
Merge pull request #6687 from yosefe/topic/osc-ucx-fix-ud-self-deadlock
OSC/UCX: Fix deadlock with atomic lock
2 parents 61adcd9 + 9d1994b commit 0c1da0f

File tree

3 files changed

14 additions and 6 deletions

3 files changed

14 additions and 6 deletions

ompi/mca/osc/ucx/osc_ucx_active_target.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -272,6 +272,7 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t
272272
ompi_osc_ucx_handle_incoming_post(module, &(module->state.post_state[j]), NULL, 0);
273273
}
274274

275+
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
275276
usleep(100);
276277
} while (1);
277278
}

ompi/mca/osc/ucx/osc_ucx_comm.c

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -240,7 +240,7 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, int target) {
240240
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_ACC_LOCK_OFFSET;
241241
int ret = OMPI_SUCCESS;
242242

243-
while (result_value != TARGET_LOCK_UNLOCKED) {
243+
for (;;) {
244244
ret = opal_common_ucx_wpmem_cmpswp(module->state_mem,
245245
TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
246246
target, &result_value, sizeof(result_value),
@@ -249,9 +249,12 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, int target) {
249249
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_cmpswp failed: %d", ret);
250250
return OMPI_ERROR;
251251
}
252-
}
252+
if (result_value == TARGET_LOCK_UNLOCKED) {
253+
return OMPI_SUCCESS;
254+
}
253255

254-
return ret;
256+
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
257+
}
255258
}
256259

257260
static inline int end_atomicity(ompi_osc_ucx_module_t *module, int target) {

ompi/mca/osc/ucx/osc_ucx_passive_target.c

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) {
4242
} else {
4343
break;
4444
}
45+
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
4546
}
4647

4748
return ret;
@@ -58,17 +59,20 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) {
5859
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET;
5960
int ret = OMPI_SUCCESS;
6061

61-
while (result_value != TARGET_LOCK_UNLOCKED) {
62+
for (;;) {
6263
ret = opal_common_ucx_wpmem_cmpswp(module->state_mem,
6364
TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
6465
target, &result_value, sizeof(result_value),
6566
remote_addr);
6667
if (OMPI_SUCCESS != ret) {
6768
return ret;
6869
}
69-
}
70+
if (result_value == TARGET_LOCK_UNLOCKED) {
71+
return OMPI_SUCCESS;
72+
}
7073

71-
return ret;
74+
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
75+
}
7276
}
7377

7478
static inline int end_exclusive(ompi_osc_ucx_module_t *module, int target) {

0 commit comments

Comments
 (0)