Skip to content

Commit 3ccf7e3

Browse files
committed
osc/rdma: fix errors in derived datatype handling for accumulate
This commit fixes a number of bugs in the handling of derived datatypes when using MPI__Accumulate, MP_Get_accumulate, and MPI_Fetch_and_op. The following bugs are fixed: - Incorrect results when using MPI_OP_NO_OP with a 0 origin count. osc/rdma was not ignoring the source address, count, and datatype in the MPI_OP_NO_OP case as is required by the standard. - Correctly handle result_buffer=MPI_BOTTOM in ompi_osc_rdma_gacc_local(). - Test result_datatype in order to figure out whether a get is performed. References #6275 Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp> Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
1 parent 2c48deb commit 3ccf7e3

File tree

3 files changed

+73
-132
lines changed

3 files changed

+73
-132
lines changed

ompi/mca/osc/rdma/osc_rdma_accumulate.c

Lines changed: 67 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22
/*
33
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
44
* reserved.
5-
* Copyright (c) 2016-2017 Research Organization for Information Science
6-
* and Technology (RIST). All rights reserved.
5+
* Copyright (c) 2016-2019 Research Organization for Information Science
6+
* and Technology (RIST). All rights reserved.
77
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
88
* Copyright (c) 2019 Triad National Security, LLC. All rights
99
* reserved.
10-
* Copyright (c) 2019 Google, LLC. All rights reserved.
10+
* Copyright (c) 2019-2021 Google, LLC. All rights reserved.
1111
* $COPYRIGHT$
1212
*
1313
* Additional copyrights may follow
@@ -53,71 +53,6 @@ struct ompi_osc_rdma_event_t {
5353

5454
typedef struct ompi_osc_rdma_event_t ompi_osc_rdma_event_t;
5555

56-
#if 0
57-
static void *ompi_osc_rdma_event_put (int fd, int flags, void *context)
58-
{
59-
ompi_osc_rdma_event_t *event = (ompi_osc_rdma_event_t *) context;
60-
int ret;
61-
62-
ret = event->module->selected_btl->btl_put (event->module->selected_btl, event->endpoint, event->local_address,
63-
event->remote_address, event->local_handle, event->remote_handle,
64-
event->length, 0, MCA_BTL_NO_ORDER, event->cbfunc, event->cbcontext,
65-
event->cbdata);
66-
if (OPAL_LIKELY(OPAL_SUCCESS == ret)) {
67-
/* done with this event */
68-
opal_event_del (&event->super);
69-
free (event);
70-
} else {
71-
/* re-activate the event */
72-
opal_event_active (&event->super, OPAL_EV_READ, 1);
73-
}
74-
75-
return NULL;
76-
}
77-
78-
static int ompi_osc_rdma_event_queue (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
79-
ompi_osc_rdma_event_type_t event_type, void *local_address, mca_btl_base_registration_handle_t *local_handle,
80-
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
81-
uint64_t length, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
82-
void *cbdata)
83-
{
84-
ompi_osc_rdma_event_t *event = malloc (sizeof (*event));
85-
void *(*event_func) (int, int, void *);
86-
87-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "queueing event type %d", event_type);
88-
89-
if (OPAL_UNLIKELY(NULL == event)) {
90-
return OMPI_ERR_OUT_OF_RESOURCE;
91-
}
92-
93-
event->module = module;
94-
event->endpoint = endpoint;
95-
event->local_address = local_address;
96-
event->local_handle = local_handle;
97-
event->remote_address = remote_address;
98-
event->remote_handle = remote_handle;
99-
event->length = length;
100-
event->cbfunc = cbfunc;
101-
event->cbcontext = cbcontext;
102-
event->cbdata = cbdata;
103-
104-
switch (event_type) {
105-
case OMPI_OSC_RDMA_EVENT_TYPE_PUT:
106-
event_func = ompi_osc_rdma_event_put;
107-
break;
108-
default:
109-
opal_output(0, "osc/rdma: cannot queue unknown event type %d", event_type);
110-
abort ();
111-
}
112-
113-
opal_event_set (opal_sync_event_base, &event->super, -1, OPAL_EV_READ,
114-
event_func, event);
115-
opal_event_active (&event->super, OPAL_EV_READ, 1);
116-
117-
return OMPI_SUCCESS;
118-
}
119-
#endif
120-
12156
static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count, ompi_datatype_t *source_datatype,
12257
void *result_buffer, int result_count, ompi_datatype_t *result_datatype,
12358
ompi_osc_rdma_peer_t *peer, uint64_t target_address,
@@ -130,7 +65,7 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count
13065
do {
13166
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing accumulate with local region(s)");
13267

133-
if (NULL != result_buffer) {
68+
if (NULL != result_datatype) {
13469
/* get accumulate */
13570

13671
ret = ompi_datatype_sndrcv ((void *) (intptr_t) target_address, target_count, target_datatype,
@@ -187,7 +122,8 @@ static inline int ompi_osc_rdma_cas_local (const void *source_addr, const void *
187122

188123
static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const void *source, int source_count,
189124
ompi_datatype_t *source_datatype, void *result, int result_count,
190-
ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
125+
ompi_datatype_t *result_datatype, opal_convertor_t *result_convertor,
126+
ompi_osc_rdma_peer_t *peer, uint64_t target_address,
191127
mca_btl_base_registration_handle_t *target_handle, int target_count,
192128
ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request)
193129
{
@@ -222,8 +158,7 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
222158
uint32_t iov_count = 1;
223159
size_t size = request->len;
224160

225-
opal_convertor_unpack (&request->convertor, &iov, &iov_count, &size);
226-
opal_convertor_cleanup (&request->convertor);
161+
opal_convertor_unpack (result_convertor, &iov, &iov_count, &size);
227162
} else {
228163
/* copy contiguous data to the result buffer */
229164
ompi_datatype_sndrcv (ptr, len, MPI_BYTE, result, result_count, result_datatype);
@@ -265,7 +200,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
265200
struct iovec source_iovec[OMPI_OSC_RDMA_DECODE_MAX], target_iovec[OMPI_OSC_RDMA_DECODE_MAX];
266201
const size_t acc_limit = (mca_osc_rdma_component.buffer_size >> 3);
267202
uint32_t source_primitive_count, target_primitive_count;
268-
opal_convertor_t source_convertor, target_convertor;
203+
opal_convertor_t source_convertor, target_convertor, result_convertor;
269204
uint32_t source_iov_count, target_iov_count;
270205
uint32_t source_iov_index, target_iov_index;
271206
ompi_datatype_t *source_primitive, *target_primitive;
@@ -282,6 +217,13 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
282217
request->internal = true;
283218
}
284219

220+
if (&ompi_mpi_op_no_op.op == op) {
221+
/* NTH: just zero these out to catch any coding errors (they should be ignored in the no-op case) */
222+
source_count = 0;
223+
source_datatype = NULL;
224+
source_addr = NULL;
225+
}
226+
285227
request->cleanup = ompi_osc_rdma_gacc_master_cleanup;
286228
request->type = result_datatype ? OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC;
287229

@@ -304,7 +246,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
304246
}
305247

306248
ret = ompi_osc_rdma_gacc_contig (sync, source_addr, source_count, source_datatype, result_addr,
307-
result_count, result_datatype, peer, target_address,
249+
result_count, result_datatype, NULL, peer, target_address,
308250
target_handle, target_count, target_datatype, op,
309251
request);
310252
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
@@ -358,6 +300,20 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
358300
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
359301
return ret;
360302
}
303+
source_iov_count = 0;
304+
} else {
305+
source_iovec[0].iov_len = (size_t) -1;
306+
source_iovec[0].iov_base = NULL;
307+
source_iov_count = 1;
308+
}
309+
310+
if (result_datatype) {
311+
OBJ_CONSTRUCT(&result_convertor, opal_convertor_t);
312+
ret = opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor, &result_datatype->super, result_count, result_addr,
313+
0, &result_convertor);
314+
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
315+
return ret;
316+
}
361317
}
362318

363319
/* target_datatype can never be NULL */
@@ -373,59 +329,42 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
373329

374330
target_iov_index = 0;
375331
target_iov_count = 0;
332+
source_iov_index = 0;
376333
result_position = 0;
377334
subreq = NULL;
378335

379336
do {
380-
/* decode segments of the source data */
381-
source_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
382-
source_iov_index = 0;
383-
/* opal_convertor_raw returns done when it has reached the end of the data */
384-
if (!source_datatype) {
385-
done = true;
386-
source_iovec[0].iov_len = (size_t) -1;
387-
source_iovec[0].iov_base = NULL;
388-
source_iov_count = 1;
389-
} else {
390-
done = opal_convertor_raw (&source_convertor, source_iovec, &source_iov_count, &source_size);
391-
}
392-
393-
/* loop on the target segments until we have exhaused the decoded source data */
394-
while (source_iov_index != source_iov_count) {
395-
if (target_iov_index == target_iov_count) {
396-
/* decode segments of the target buffer */
397-
target_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
398-
target_iov_index = 0;
399-
(void) opal_convertor_raw (&target_convertor, target_iovec, &target_iov_count, &target_size);
337+
/* decode segments of the target buffer */
338+
target_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
339+
target_iov_index = 0;
340+
done = opal_convertor_raw (&target_convertor, target_iovec, &target_iov_count, &target_size);
341+
342+
/* loop on the source segments (if any) until we have exhaused the decoded target data */
343+
while (target_iov_index != target_iov_count) {
344+
if (source_iov_count == source_iov_index) {
345+
/* decode segments of the source data */
346+
source_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
347+
source_iov_index = 0;
348+
(void) opal_convertor_raw (&source_convertor, source_iovec, &source_iov_count, &source_size);
400349
}
401350

402351
/* we already checked that the target was large enough. this should be impossible */
403352
assert (0 != target_iov_count);
404353

405354
/* determine how much to put in this operation */
406-
acc_len = min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len);
407-
acc_len = min((size_t) acc_len, acc_limit);
355+
acc_len = min(min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len), acc_limit);
408356

409-
/* execute the get */
357+
/* execute the get-accumulate */
410358
if (!subreq) {
411359
OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
412360
subreq->internal = true;
413361
subreq->parent_request = request;
362+
subreq->type = result_datatype ? OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC;
414363
(void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, 1);
415364
}
416365

417-
if (result_datatype) {
418-
/* prepare a convertor for this part of the result */
419-
opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor, &result_datatype->super, result_count,
420-
result_addr, 0, &subreq->convertor);
421-
opal_convertor_set_position (&subreq->convertor, &result_position);
422-
subreq->type = OMPI_OSC_RDMA_TYPE_GET_ACC;
423-
} else {
424-
subreq->type = OMPI_OSC_RDMA_TYPE_ACC;
425-
}
426-
427366
ret = ompi_osc_rdma_gacc_contig (sync, source_iovec[source_iov_index].iov_base, acc_len / target_primitive->super.size,
428-
target_primitive, NULL, 0, NULL, peer,
367+
target_primitive, NULL, 0, NULL, &result_convertor, peer,
429368
(uint64_t) (intptr_t) target_iovec[target_iov_index].iov_base, target_handle,
430369
acc_len / target_primitive->super.size, target_primitive, op, subreq);
431370
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
@@ -445,13 +384,16 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
445384

446385
/* adjust io vectors */
447386
target_iovec[target_iov_index].iov_len -= acc_len;
448-
source_iovec[source_iov_index].iov_len -= acc_len;
449387
target_iovec[target_iov_index].iov_base = (void *)((intptr_t) target_iovec[target_iov_index].iov_base + acc_len);
450-
source_iovec[source_iov_index].iov_base = (void *)((intptr_t) source_iovec[source_iov_index].iov_base + acc_len);
388+
target_iov_index += (0 == target_iovec[target_iov_index].iov_len);
389+
451390
result_position += acc_len;
452391

453-
source_iov_index += !source_datatype || (0 == source_iovec[source_iov_index].iov_len);
454-
target_iov_index += (0 == target_iovec[target_iov_index].iov_len);
392+
if (source_datatype) {
393+
source_iov_index += (0 == source_iovec[source_iov_index].iov_len);
394+
source_iovec[source_iov_index].iov_len -= acc_len;
395+
source_iovec[source_iov_index].iov_base = (void *)((intptr_t) source_iovec[source_iov_index].iov_base + acc_len);
396+
}
455397
}
456398
} while (!done);
457399

@@ -463,6 +405,11 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
463405
OBJ_DESTRUCT(&source_convertor);
464406
}
465407

408+
if (result_datatype) {
409+
opal_convertor_cleanup (&result_convertor);
410+
OBJ_DESTRUCT(&result_convertor);
411+
}
412+
466413
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "finished scheduling rdma on non-contiguous datatype(s)");
467414

468415
opal_convertor_cleanup (&target_convertor);
@@ -589,9 +536,9 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
589536
new_value = old_value;
590537

591538
if (&ompi_mpi_op_replace.op == op) {
592-
memcpy ((void *)((intptr_t) &new_value + offset), origin_addr, extent);
539+
memcpy ((void *)((intptr_t) &new_value + offset), origin_addr + dt->super.true_lb, extent);
593540
} else if (&ompi_mpi_op_no_op.op != op) {
594-
ompi_op_reduce (op, (void *) origin_addr, (void*)((intptr_t) &new_value + offset), 1, dt);
541+
ompi_op_reduce (op, (void *) origin_addr + dt->super.true_lb, (void*)((intptr_t) &new_value + offset), 1, dt);
595542
}
596543

597544
ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, address, target_handle,
@@ -866,7 +813,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
866813
ompi_osc_rdma_module_t *module = sync->module;
867814
mca_btl_base_registration_handle_t *target_handle;
868815
uint64_t target_address;
869-
ptrdiff_t lb, origin_extent, target_span;
816+
ptrdiff_t lb, target_lb, origin_extent, target_span;
870817
bool lock_acquired = false;
871818
int ret;
872819

@@ -879,11 +826,11 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
879826
return OMPI_SUCCESS;
880827
}
881828

882-
target_span = opal_datatype_span(&target_datatype->super, target_count, &lb);
829+
target_span = opal_datatype_span(&target_datatype->super, target_count, &target_lb);
883830

884831
// a buffer defined by (buf, count, dt)
885832
// will have data starting at buf+offset and ending len bytes later:
886-
ret = osc_rdma_get_remote_segment (module, peer, target_disp, target_span+lb, &target_address, &target_handle);
833+
ret = osc_rdma_get_remote_segment (module, peer, target_disp, target_span+target_lb, &target_address, &target_handle);
887834
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
888835
return ret;
889836
}
@@ -916,10 +863,10 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
916863
if (origin_extent <= 8 && 1 == origin_count && !use_shared_mem) {
917864
if (module->acc_use_amo && ompi_datatype_is_predefined (origin_datatype)) {
918865
if (NULL == result_addr) {
919-
ret = ompi_osc_rdma_acc_single_atomic (sync, origin_addr, origin_datatype, origin_extent, peer, target_address,
866+
ret = ompi_osc_rdma_acc_single_atomic (sync, origin_addr, origin_datatype, origin_extent, peer, target_address+target_lb,
920867
target_handle, op, request, lock_acquired);
921868
} else {
922-
ret = ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, result_addr, origin_datatype, origin_extent, peer, target_address,
869+
ret = ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, result_addr, origin_datatype, origin_extent, peer, target_address+target_lb,
923870
target_handle, op, request, lock_acquired);
924871
}
925872

@@ -928,7 +875,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
928875
}
929876
}
930877

931-
ret = ompi_osc_rdma_fetch_and_op_cas (sync, origin_addr, result_addr, origin_datatype, origin_extent, peer, target_address,
878+
ret = ompi_osc_rdma_fetch_and_op_cas (sync, origin_addr, result_addr, origin_datatype, origin_extent, peer, target_address+target_lb,
932879
target_handle, op, request, lock_acquired);
933880
if (OMPI_SUCCESS == ret) {
934881
return OMPI_SUCCESS;

ompi/mca/osc/rdma/osc_rdma_request.c

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
* Copyright (c) 2016 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
9+
* Copyright (c) 2019 Triad National Security, LLC. All rights
10+
* reserved.
911
* $COPYRIGHT$
1012
*
1113
* Additional copyrights may follow
@@ -56,15 +58,7 @@ static void request_construct(ompi_osc_rdma_request_t *request)
5658
request->internal = false;
5759
request->cleanup = NULL;
5860
request->outstanding_requests = 0;
59-
OBJ_CONSTRUCT(&request->convertor, opal_convertor_t);
60-
}
61-
62-
static void request_destruct(ompi_osc_rdma_request_t *request)
63-
{
64-
OBJ_DESTRUCT(&request->convertor);
6561
}
6662

67-
OBJ_CLASS_INSTANCE(ompi_osc_rdma_request_t,
68-
ompi_request_t,
69-
request_construct,
70-
request_destruct);
63+
OBJ_CLASS_INSTANCE(ompi_osc_rdma_request_t, ompi_request_t,
64+
request_construct, NULL);

ompi/mca/osc/rdma/osc_rdma_request.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
44
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
55
* reserved.
6+
* Copyright (c) 2019 Triad National Security, LLC. All rights
7+
* reserved.
68
* $COPYRIGHT$
79
*
810
* Additional copyrights may follow
@@ -51,8 +53,6 @@ struct ompi_osc_rdma_request_t {
5153
uint64_t target_address;
5254

5355
struct ompi_osc_rdma_request_t *parent_request;
54-
/* used for non-contiguous get accumulate operations */
55-
opal_convertor_t convertor;
5656

5757
/** synchronization object */
5858
struct ompi_osc_rdma_sync_t *sync;

0 commit comments

Comments
 (0)