Skip to content

Commit 9d1994b

Browse files
committed
OSC/UCX: Fix deadlock with atomic lock
The atomic lock must progress the local worker while obtaining the remote lock; otherwise, an active message that actually releases the lock might never be processed while polling on a local memory location.

Signed-off-by: Yossi Itigin <yosefe@mellanox.com>
1 parent 61adcd9 commit 9d1994b

File tree

3 files changed

+14
-6
lines changed

3 files changed

+14
-6
lines changed

ompi/mca/osc/ucx/osc_ucx_active_target.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t
272272
ompi_osc_ucx_handle_incoming_post(module, &(module->state.post_state[j]), NULL, 0);
273273
}
274274

275+
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
275276
usleep(100);
276277
} while (1);
277278
}

ompi/mca/osc/ucx/osc_ucx_comm.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, int target) {
240240
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_ACC_LOCK_OFFSET;
241241
int ret = OMPI_SUCCESS;
242242

243-
while (result_value != TARGET_LOCK_UNLOCKED) {
243+
for (;;) {
244244
ret = opal_common_ucx_wpmem_cmpswp(module->state_mem,
245245
TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
246246
target, &result_value, sizeof(result_value),
@@ -249,9 +249,12 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, int target) {
249249
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_cmpswp failed: %d", ret);
250250
return OMPI_ERROR;
251251
}
252-
}
252+
if (result_value == TARGET_LOCK_UNLOCKED) {
253+
return OMPI_SUCCESS;
254+
}
253255

254-
return ret;
256+
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
257+
}
255258
}
256259

257260
static inline int end_atomicity(ompi_osc_ucx_module_t *module, int target) {

ompi/mca/osc/ucx/osc_ucx_passive_target.c

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) {
4242
} else {
4343
break;
4444
}
45+
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
4546
}
4647

4748
return ret;
@@ -58,17 +59,20 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) {
5859
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET;
5960
int ret = OMPI_SUCCESS;
6061

61-
while (result_value != TARGET_LOCK_UNLOCKED) {
62+
for (;;) {
6263
ret = opal_common_ucx_wpmem_cmpswp(module->state_mem,
6364
TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
6465
target, &result_value, sizeof(result_value),
6566
remote_addr);
6667
if (OMPI_SUCCESS != ret) {
6768
return ret;
6869
}
69-
}
70+
if (result_value == TARGET_LOCK_UNLOCKED) {
71+
return OMPI_SUCCESS;
72+
}
7073

71-
return ret;
74+
ucp_worker_progress(mca_osc_ucx_component.wpool->dflt_worker);
75+
}
7276
}
7377

7478
static inline int end_exclusive(ompi_osc_ucx_module_t *module, int target) {

0 commit comments

Comments
 (0)