Skip to content

Commit d3cba3e

Browse files
authored
Merge pull request #8475 from awlauria/fix_rdma_accumulate_segv
Fix osc/rdma gcc warnings + possible race
2 parents e534c57 + 7856a25 commit d3cba3e

File tree

2 files changed

+27
-23
lines changed

2 files changed

+27
-23
lines changed

ompi/mca/osc/rdma/osc_rdma_accumulate.c

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,9 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
192192
new_value = old_value;
193193

194194
if (&ompi_mpi_op_replace.op == op) {
195-
memcpy ((void *)((intptr_t) &new_value + offset), origin_addr + dt->super.true_lb, extent);
195+
memcpy ((void *)((intptr_t) &new_value + offset), (void *)((intptr_t) origin_addr + dt->super.true_lb), extent);
196196
} else if (&ompi_mpi_op_no_op.op != op) {
197-
ompi_op_reduce (op, (void *) origin_addr + dt->super.true_lb, (void*)((intptr_t) &new_value + offset), 1, dt);
197+
ompi_op_reduce (op, (void *) ((intptr_t) origin_addr + dt->super.true_lb), (void*)((intptr_t) &new_value + offset), 1, dt);
198198
}
199199

200200
ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, address, target_handle,
@@ -219,7 +219,7 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo
219219
{
220220
ompi_osc_rdma_module_t *module = sync->module;
221221
int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
222-
int ret, btl_op, flags;
222+
int btl_op, flags;
223223
int64_t origin;
224224

225225
if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
@@ -261,7 +261,6 @@ static inline int ompi_osc_rdma_gacc_amo (ompi_osc_rdma_module_t *module, ompi_o
261261
const bool use_amo = module->acc_use_amo;
262262
const size_t dt_size = datatype->super.size;
263263
void *to_free = NULL;
264-
uint64_t offset = 0;
265264
int ret;
266265

267266
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using network atomics for accumulate operation with count %d", count);
@@ -339,7 +338,7 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
339338
/* if the datatype is small enough (and the count is 1) then try to directly use the hardware to execute
340339
* the atomic operation. this should be safe in all cases as either 1) the user has assured us they will
341340
* never use atomics with count > 1, 2) we have the accumulate lock, or 3) we have an exclusive lock */
342-
if (target_datatype->super.size <= 8 && target_count <= module->network_amo_max_count) {
341+
if ((target_datatype->super.size <= 8) && (((unsigned long) target_count) <= module->network_amo_max_count)) {
343342
ret = ompi_osc_rdma_gacc_amo (module, sync, source, result, result_count, result_datatype, result_convertor,
344343
peer, target_address, target_handle, target_count, target_datatype, op, request);
345344
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
@@ -867,7 +866,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_win_t *win, const void *origin_
867866
ompi_osc_rdma_module_t *module = GET_MODULE(win);
868867
mca_btl_base_registration_handle_t *target_handle;
869868
uint64_t target_address;
870-
ptrdiff_t lb, target_lb, target_span;
869+
ptrdiff_t target_lb, target_span;
871870
ompi_osc_rdma_request_t *rdma_request = NULL;
872871
bool lock_acquired = false;
873872
ompi_osc_rdma_sync_t *sync;

ompi/mca/osc/rdma/osc_rdma_dynamic.c

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
168168
ompi_osc_rdma_handle_t *rdma_region_handle;
169169
osc_rdma_counter_t region_count;
170170
osc_rdma_counter_t region_id;
171-
intptr_t bound, aligned_base, aligned_bound;
171+
intptr_t aligned_base, aligned_bound;
172172
intptr_t page_size = opal_getpagesize ();
173173
int region_index, ret;
174174
size_t aligned_len;
@@ -185,6 +185,7 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
185185
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach: %s, %p, %lu", win->w_name, base, (unsigned long) len);
186186

187187
OPAL_THREAD_LOCK(&module->lock);
188+
ompi_osc_rdma_lock_acquire_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
188189

189190
region_count = module->state->region_count & 0xffffffffL;
190191
region_id = module->state->region_count >> 32;
@@ -197,7 +198,6 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
197198

198199
/* it is wasteful to register less than a page. this may allow the remote side to access more
199200
* memory but the MPI standard covers this with calling the calling behavior erroneous */
200-
bound = (intptr_t) base + len;
201201
aligned_bound = OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t);
202202
aligned_base = (intptr_t) base & ~(page_size - 1);
203203
aligned_len = (size_t)(aligned_bound - aligned_base);
@@ -209,16 +209,11 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
209209
/* validates that the region does not overlap with an existing region even if they are on the same page */
210210
ret = ompi_osc_rdma_add_attachment (module->dynamic_handles[region_index], (intptr_t) base, len);
211211
OPAL_THREAD_UNLOCK(&module->lock);
212+
ompi_osc_rdma_lock_release_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
212213
/* no need to invalidate remote caches */
213214
return ret;
214215
}
215216

216-
/* region is in flux */
217-
module->state->region_count = -1;
218-
opal_atomic_wmb ();
219-
220-
ompi_osc_rdma_lock_acquire_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
221-
222217
/* do a binary seach for where the region should be inserted */
223218
if (region_count) {
224219
region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1,
@@ -239,7 +234,7 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
239234
region->len = aligned_len;
240235

241236
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} aligned {%p, %p}, at index %d",
242-
base, (void *) bound, (void *) aligned_base, (void *) aligned_bound, region_index);
237+
base, (void *) ((intptr_t) base + len), (void *) aligned_base, (void *) aligned_bound, region_index);
243238

244239
/* add RDMA region handle to track this region */
245240
rdma_region_handle = OBJ_NEW(ompi_osc_rdma_handle_t);
@@ -253,6 +248,7 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
253248
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
254249
OPAL_THREAD_UNLOCK(&module->lock);
255250
OBJ_RELEASE(rdma_region_handle);
251+
ompi_osc_rdma_lock_release_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
256252
return OMPI_ERR_RMA_ATTACH;
257253
}
258254

@@ -275,9 +271,9 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
275271
}
276272
#endif
277273

278-
opal_atomic_mb ();
279274
/* the region state has changed */
280275
module->state->region_count = ((region_id + 1) << 32) | (region_count + 1);
276+
opal_atomic_wmb ();
281277

282278
ompi_osc_rdma_lock_release_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
283279
OPAL_THREAD_UNLOCK(&module->lock);
@@ -304,6 +300,9 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
304300

305301
OPAL_THREAD_LOCK(&module->lock);
306302

303+
/* lock the region so it can't change while a peer is reading it */
304+
ompi_osc_rdma_lock_acquire_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));
305+
307306
/* the upper 4 bytes of the region count are an instance counter */
308307
region_count = module->state->region_count & 0xffffffffL;
309308
region_id = module->state->region_count >> 32;
@@ -327,23 +326,22 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
327326
if (region_index == region_count) {
328327
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory attachment for %p", base);
329328
OPAL_THREAD_UNLOCK(&module->lock);
329+
ompi_osc_rdma_lock_release_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));
330330
return OMPI_ERR_BASE;
331331
}
332332

333333
if (!opal_list_is_empty (&rdma_region_handle->attachments)) {
334334
/* another region is referencing this attachment */
335+
OPAL_THREAD_UNLOCK(&module->lock);
336+
ompi_osc_rdma_lock_release_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));
335337
return OMPI_SUCCESS;
336338
}
337339

338-
/* lock the region so it can't change while a peer is reading it */
339-
ompi_osc_rdma_lock_acquire_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));
340-
341340
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "detaching dynamic memory region {%p, %p} from index %d",
342341
base, (void *)((intptr_t) base + region->len), region_index);
343342

344343
if (module->selected_btl->btl_register_mem) {
345344
ompi_osc_rdma_deregister (module, rdma_region_handle->btl_handle);
346-
347345
}
348346

349347
if (region_index < region_count - 1) {
@@ -358,6 +356,7 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
358356
module->dynamic_handles[region_count - 1] = NULL;
359357

360358
module->state->region_count = ((region_id + 1) << 32) | (region_count - 1);
359+
opal_atomic_wmb ();
361360

362361
ompi_osc_rdma_lock_release_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));
363362

@@ -461,14 +460,18 @@ int ompi_osc_rdma_find_dynamic_region (ompi_osc_rdma_module_t *module, ompi_osc_
461460
ompi_osc_rdma_peer_dynamic_t *dy_peer = (ompi_osc_rdma_peer_dynamic_t *) peer;
462461
intptr_t bound = (intptr_t) base + len;
463462
ompi_osc_rdma_region_t *regions;
464-
int ret, region_count;
463+
int ret = OMPI_SUCCESS, region_count;
465464

466465
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "locating dynamic memory region matching: {%" PRIx64 ", %" PRIx64 "}"
467466
" (len %lu)", base, base + len, (unsigned long) len);
468467

468+
OPAL_THREAD_LOCK(&module->lock);
469+
// Make sure region isn't being touched.
470+
ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
469471
if (!ompi_osc_rdma_peer_local_state (peer)) {
470472
ret = ompi_osc_rdma_refresh_dynamic_region (module, dy_peer);
471473
if (OMPI_SUCCESS != ret) {
474+
ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
472475
return ret;
473476
}
474477

@@ -482,9 +485,11 @@ int ompi_osc_rdma_find_dynamic_region (ompi_osc_rdma_module_t *module, ompi_osc_
482485

483486
*region = ompi_osc_rdma_find_region_containing (regions, 0, region_count - 1, (intptr_t) base, bound, module->region_size, NULL);
484487
if (!*region) {
485-
return OMPI_ERR_RMA_RANGE;
488+
ret = OMPI_ERR_RMA_RANGE;
486489
}
490+
OPAL_THREAD_UNLOCK(&module->lock);
491+
ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
487492

488493
/* round a matching region */
489-
return OMPI_SUCCESS;
494+
return ret;
490495
}

0 commit comments

Comments
 (0)