2
2
/*
3
3
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
4
4
* reserved.
5
- * Copyright (c) 2016-2017 Research Organization for Information Science
6
- * and Technology (RIST). All rights reserved.
5
+ * Copyright (c) 2016-2019 Research Organization for Information Science
6
+ * and Technology (RIST). All rights reserved.
7
7
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
8
8
* Copyright (c) 2019 Triad National Security, LLC. All rights
9
9
* reserved.
10
- * Copyright (c) 2019 Google, LLC. All rights reserved.
10
+ * Copyright (c) 2019-2021 Google, LLC. All rights reserved.
11
11
* $COPYRIGHT$
12
12
*
13
13
* Additional copyrights may follow
@@ -53,71 +53,6 @@ struct ompi_osc_rdma_event_t {
53
53
54
54
typedef struct ompi_osc_rdma_event_t ompi_osc_rdma_event_t ;
55
55
56
- #if 0
57
- static void * ompi_osc_rdma_event_put (int fd , int flags , void * context )
58
- {
59
- ompi_osc_rdma_event_t * event = (ompi_osc_rdma_event_t * ) context ;
60
- int ret ;
61
-
62
- ret = event -> module -> selected_btl -> btl_put (event -> module -> selected_btl , event -> endpoint , event -> local_address ,
63
- event -> remote_address , event -> local_handle , event -> remote_handle ,
64
- event -> length , 0 , MCA_BTL_NO_ORDER , event -> cbfunc , event -> cbcontext ,
65
- event -> cbdata );
66
- if (OPAL_LIKELY (OPAL_SUCCESS == ret )) {
67
- /* done with this event */
68
- opal_event_del (& event -> super );
69
- free (event );
70
- } else {
71
- /* re-activate the event */
72
- opal_event_active (& event -> super , OPAL_EV_READ , 1 );
73
- }
74
-
75
- return NULL ;
76
- }
77
-
78
- static int ompi_osc_rdma_event_queue (ompi_osc_rdma_module_t * module , struct mca_btl_base_endpoint_t * endpoint ,
79
- ompi_osc_rdma_event_type_t event_type , void * local_address , mca_btl_base_registration_handle_t * local_handle ,
80
- uint64_t remote_address , mca_btl_base_registration_handle_t * remote_handle ,
81
- uint64_t length , mca_btl_base_rdma_completion_fn_t cbfunc , void * cbcontext ,
82
- void * cbdata )
83
- {
84
- ompi_osc_rdma_event_t * event = malloc (sizeof (* event ));
85
- void * (* event_func ) (int , int , void * );
86
-
87
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "queueing event type %d" , event_type );
88
-
89
- if (OPAL_UNLIKELY (NULL == event )) {
90
- return OMPI_ERR_OUT_OF_RESOURCE ;
91
- }
92
-
93
- event -> module = module ;
94
- event -> endpoint = endpoint ;
95
- event -> local_address = local_address ;
96
- event -> local_handle = local_handle ;
97
- event -> remote_address = remote_address ;
98
- event -> remote_handle = remote_handle ;
99
- event -> length = length ;
100
- event -> cbfunc = cbfunc ;
101
- event -> cbcontext = cbcontext ;
102
- event -> cbdata = cbdata ;
103
-
104
- switch (event_type ) {
105
- case OMPI_OSC_RDMA_EVENT_TYPE_PUT :
106
- event_func = ompi_osc_rdma_event_put ;
107
- break ;
108
- default :
109
- opal_output (0 , "osc/rdma: cannot queue unknown event type %d" , event_type );
110
- abort ();
111
- }
112
-
113
- opal_event_set (opal_sync_event_base , & event -> super , -1 , OPAL_EV_READ ,
114
- event_func , event );
115
- opal_event_active (& event -> super , OPAL_EV_READ , 1 );
116
-
117
- return OMPI_SUCCESS ;
118
- }
119
- #endif
120
-
121
56
static int ompi_osc_rdma_gacc_local (const void * source_buffer , int source_count , ompi_datatype_t * source_datatype ,
122
57
void * result_buffer , int result_count , ompi_datatype_t * result_datatype ,
123
58
ompi_osc_rdma_peer_t * peer , uint64_t target_address ,
@@ -130,7 +65,7 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count
130
65
do {
131
66
OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "performing accumulate with local region(s)" );
132
67
133
- if (NULL != result_buffer ) {
68
+ if (NULL != result_datatype ) {
134
69
/* get accumulate */
135
70
136
71
ret = ompi_datatype_sndrcv ((void * ) (intptr_t ) target_address , target_count , target_datatype ,
@@ -187,7 +122,8 @@ static inline int ompi_osc_rdma_cas_local (const void *source_addr, const void *
187
122
188
123
static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t * sync , const void * source , int source_count ,
189
124
ompi_datatype_t * source_datatype , void * result , int result_count ,
190
- ompi_datatype_t * result_datatype , ompi_osc_rdma_peer_t * peer , uint64_t target_address ,
125
+ ompi_datatype_t * result_datatype , opal_convertor_t * result_convertor ,
126
+ ompi_osc_rdma_peer_t * peer , uint64_t target_address ,
191
127
mca_btl_base_registration_handle_t * target_handle , int target_count ,
192
128
ompi_datatype_t * target_datatype , ompi_op_t * op , ompi_osc_rdma_request_t * request )
193
129
{
@@ -222,8 +158,7 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
222
158
uint32_t iov_count = 1 ;
223
159
size_t size = request -> len ;
224
160
225
- opal_convertor_unpack (& request -> convertor , & iov , & iov_count , & size );
226
- opal_convertor_cleanup (& request -> convertor );
161
+ opal_convertor_unpack (result_convertor , & iov , & iov_count , & size );
227
162
} else {
228
163
/* copy contiguous data to the result buffer */
229
164
ompi_datatype_sndrcv (ptr , len , MPI_BYTE , result , result_count , result_datatype );
@@ -265,7 +200,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
265
200
struct iovec source_iovec [OMPI_OSC_RDMA_DECODE_MAX ], target_iovec [OMPI_OSC_RDMA_DECODE_MAX ];
266
201
const size_t acc_limit = (mca_osc_rdma_component .buffer_size >> 3 );
267
202
uint32_t source_primitive_count , target_primitive_count ;
268
- opal_convertor_t source_convertor , target_convertor ;
203
+ opal_convertor_t source_convertor , target_convertor , result_convertor ;
269
204
uint32_t source_iov_count , target_iov_count ;
270
205
uint32_t source_iov_index , target_iov_index ;
271
206
ompi_datatype_t * source_primitive , * target_primitive ;
@@ -282,6 +217,13 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
282
217
request -> internal = true;
283
218
}
284
219
220
+ if (& ompi_mpi_op_no_op .op == op ) {
221
+ /* NTH: just zero these out to catch any coding errors (they should be ignored in the no-op case) */
222
+ source_count = 0 ;
223
+ source_datatype = NULL ;
224
+ source_addr = NULL ;
225
+ }
226
+
285
227
request -> cleanup = ompi_osc_rdma_gacc_master_cleanup ;
286
228
request -> type = result_datatype ? OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC ;
287
229
@@ -304,7 +246,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
304
246
}
305
247
306
248
ret = ompi_osc_rdma_gacc_contig (sync , source_addr , source_count , source_datatype , result_addr ,
307
- result_count , result_datatype , peer , target_address ,
249
+ result_count , result_datatype , NULL , peer , target_address ,
308
250
target_handle , target_count , target_datatype , op ,
309
251
request );
310
252
if (OPAL_LIKELY (OMPI_SUCCESS == ret )) {
@@ -358,6 +300,20 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
358
300
if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
359
301
return ret ;
360
302
}
303
+ source_iov_count = 0 ;
304
+ } else {
305
+ source_iovec [0 ].iov_len = (size_t ) -1 ;
306
+ source_iovec [0 ].iov_base = NULL ;
307
+ source_iov_count = 1 ;
308
+ }
309
+
310
+ if (result_datatype ) {
311
+ OBJ_CONSTRUCT (& result_convertor , opal_convertor_t );
312
+ ret = opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor , & result_datatype -> super , result_count , result_addr ,
313
+ 0 , & result_convertor );
314
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
315
+ return ret ;
316
+ }
361
317
}
362
318
363
319
/* target_datatype can never be NULL */
@@ -373,59 +329,42 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
373
329
374
330
target_iov_index = 0 ;
375
331
target_iov_count = 0 ;
332
+ source_iov_index = 0 ;
376
333
result_position = 0 ;
377
334
subreq = NULL ;
378
335
379
336
do {
380
- /* decode segments of the source data */
381
- source_iov_count = OMPI_OSC_RDMA_DECODE_MAX ;
382
- source_iov_index = 0 ;
383
- /* opal_convertor_raw returns done when it has reached the end of the data */
384
- if (!source_datatype ) {
385
- done = true;
386
- source_iovec [0 ].iov_len = (size_t ) -1 ;
387
- source_iovec [0 ].iov_base = NULL ;
388
- source_iov_count = 1 ;
389
- } else {
390
- done = opal_convertor_raw (& source_convertor , source_iovec , & source_iov_count , & source_size );
391
- }
392
-
393
- /* loop on the target segments until we have exhaused the decoded source data */
394
- while (source_iov_index != source_iov_count ) {
395
- if (target_iov_index == target_iov_count ) {
396
- /* decode segments of the target buffer */
397
- target_iov_count = OMPI_OSC_RDMA_DECODE_MAX ;
398
- target_iov_index = 0 ;
399
- (void ) opal_convertor_raw (& target_convertor , target_iovec , & target_iov_count , & target_size );
337
+ /* decode segments of the target buffer */
338
+ target_iov_count = OMPI_OSC_RDMA_DECODE_MAX ;
339
+ target_iov_index = 0 ;
340
+ done = opal_convertor_raw (& target_convertor , target_iovec , & target_iov_count , & target_size );
341
+
342
+ /* loop on the source segments (if any) until we have exhausted the decoded target data */
343
+ while (target_iov_index != target_iov_count ) {
344
+ if (source_iov_count == source_iov_index ) {
345
+ /* decode segments of the source data */
346
+ source_iov_count = OMPI_OSC_RDMA_DECODE_MAX ;
347
+ source_iov_index = 0 ;
348
+ (void ) opal_convertor_raw (& source_convertor , source_iovec , & source_iov_count , & source_size );
400
349
}
401
350
402
351
/* we already checked that the target was large enough. this should be impossible */
403
352
assert (0 != target_iov_count );
404
353
405
354
/* determine how much to put in this operation */
406
- acc_len = min (target_iovec [target_iov_index ].iov_len , source_iovec [source_iov_index ].iov_len );
407
- acc_len = min ((size_t ) acc_len , acc_limit );
355
+ acc_len = min (min (target_iovec [target_iov_index ].iov_len , source_iovec [source_iov_index ].iov_len ), acc_limit );
408
356
409
- /* execute the get */
357
+ /* execute the get-accumulate */
410
358
if (!subreq ) {
411
359
OMPI_OSC_RDMA_REQUEST_ALLOC (module , peer , subreq );
412
360
subreq -> internal = true;
413
361
subreq -> parent_request = request ;
362
+ subreq -> type = result_datatype ? OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC ;
414
363
(void ) OPAL_THREAD_ADD_FETCH32 (& request -> outstanding_requests , 1 );
415
364
}
416
365
417
- if (result_datatype ) {
418
- /* prepare a convertor for this part of the result */
419
- opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor , & result_datatype -> super , result_count ,
420
- result_addr , 0 , & subreq -> convertor );
421
- opal_convertor_set_position (& subreq -> convertor , & result_position );
422
- subreq -> type = OMPI_OSC_RDMA_TYPE_GET_ACC ;
423
- } else {
424
- subreq -> type = OMPI_OSC_RDMA_TYPE_ACC ;
425
- }
426
-
427
366
ret = ompi_osc_rdma_gacc_contig (sync , source_iovec [source_iov_index ].iov_base , acc_len / target_primitive -> super .size ,
428
- target_primitive , NULL , 0 , NULL , peer ,
367
+ target_primitive , NULL , 0 , NULL , & result_convertor , peer ,
429
368
(uint64_t ) (intptr_t ) target_iovec [target_iov_index ].iov_base , target_handle ,
430
369
acc_len / target_primitive -> super .size , target_primitive , op , subreq );
431
370
if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
@@ -445,13 +384,16 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
445
384
446
385
/* adjust io vectors */
447
386
target_iovec [target_iov_index ].iov_len -= acc_len ;
448
- source_iovec [source_iov_index ].iov_len -= acc_len ;
449
387
target_iovec [target_iov_index ].iov_base = (void * )((intptr_t ) target_iovec [target_iov_index ].iov_base + acc_len );
450
- source_iovec [source_iov_index ].iov_base = (void * )((intptr_t ) source_iovec [source_iov_index ].iov_base + acc_len );
388
+ target_iov_index += (0 == target_iovec [target_iov_index ].iov_len );
389
+
451
390
result_position += acc_len ;
452
391
453
- source_iov_index += !source_datatype || (0 == source_iovec [source_iov_index ].iov_len );
454
- target_iov_index += (0 == target_iovec [target_iov_index ].iov_len );
392
+ if (source_datatype ) {
393
+ source_iov_index += (0 == source_iovec [source_iov_index ].iov_len );
394
+ source_iovec [source_iov_index ].iov_len -= acc_len ;
395
+ source_iovec [source_iov_index ].iov_base = (void * )((intptr_t ) source_iovec [source_iov_index ].iov_base + acc_len );
396
+ }
455
397
}
456
398
} while (!done );
457
399
@@ -463,6 +405,11 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
463
405
OBJ_DESTRUCT (& source_convertor );
464
406
}
465
407
408
+ if (result_datatype ) {
409
+ opal_convertor_cleanup (& result_convertor );
410
+ OBJ_DESTRUCT (& result_convertor );
411
+ }
412
+
466
413
OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "finished scheduling rdma on non-contiguous datatype(s)" );
467
414
468
415
opal_convertor_cleanup (& target_convertor );
@@ -589,9 +536,9 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
589
536
new_value = old_value ;
590
537
591
538
if (& ompi_mpi_op_replace .op == op ) {
592
- memcpy ((void * )((intptr_t ) & new_value + offset ), origin_addr , extent );
539
+ memcpy ((void * )((intptr_t ) & new_value + offset ), origin_addr + dt -> super . true_lb , extent );
593
540
} else if (& ompi_mpi_op_no_op .op != op ) {
594
- ompi_op_reduce (op , (void * ) origin_addr , (void * )((intptr_t ) & new_value + offset ), 1 , dt );
541
+ ompi_op_reduce (op , (void * ) origin_addr + dt -> super . true_lb , (void * )((intptr_t ) & new_value + offset ), 1 , dt );
595
542
}
596
543
597
544
ret = ompi_osc_rdma_btl_cswap (module , peer -> data_endpoint , address , target_handle ,
@@ -866,7 +813,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
866
813
ompi_osc_rdma_module_t * module = sync -> module ;
867
814
mca_btl_base_registration_handle_t * target_handle ;
868
815
uint64_t target_address ;
869
- ptrdiff_t lb , origin_extent , target_span ;
816
+ ptrdiff_t lb , target_lb , origin_extent , target_span ;
870
817
bool lock_acquired = false;
871
818
int ret ;
872
819
@@ -879,11 +826,11 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
879
826
return OMPI_SUCCESS ;
880
827
}
881
828
882
- target_span = opal_datatype_span (& target_datatype -> super , target_count , & lb );
829
+ target_span = opal_datatype_span (& target_datatype -> super , target_count , & target_lb );
883
830
884
831
// a buffer defined by (buf, count, dt)
885
832
// will have data starting at buf+offset and ending len bytes later:
886
- ret = osc_rdma_get_remote_segment (module , peer , target_disp , target_span + lb , & target_address , & target_handle );
833
+ ret = osc_rdma_get_remote_segment (module , peer , target_disp , target_span + target_lb , & target_address , & target_handle );
887
834
if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
888
835
return ret ;
889
836
}
@@ -916,10 +863,10 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
916
863
if (origin_extent <= 8 && 1 == origin_count && !use_shared_mem ) {
917
864
if (module -> acc_use_amo && ompi_datatype_is_predefined (origin_datatype )) {
918
865
if (NULL == result_addr ) {
919
- ret = ompi_osc_rdma_acc_single_atomic (sync , origin_addr , origin_datatype , origin_extent , peer , target_address ,
866
+ ret = ompi_osc_rdma_acc_single_atomic (sync , origin_addr , origin_datatype , origin_extent , peer , target_address + target_lb ,
920
867
target_handle , op , request , lock_acquired );
921
868
} else {
922
- ret = ompi_osc_rdma_fetch_and_op_atomic (sync , origin_addr , result_addr , origin_datatype , origin_extent , peer , target_address ,
869
+ ret = ompi_osc_rdma_fetch_and_op_atomic (sync , origin_addr , result_addr , origin_datatype , origin_extent , peer , target_address + target_lb ,
923
870
target_handle , op , request , lock_acquired );
924
871
}
925
872
@@ -928,7 +875,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
928
875
}
929
876
}
930
877
931
- ret = ompi_osc_rdma_fetch_and_op_cas (sync , origin_addr , result_addr , origin_datatype , origin_extent , peer , target_address ,
878
+ ret = ompi_osc_rdma_fetch_and_op_cas (sync , origin_addr , result_addr , origin_datatype , origin_extent , peer , target_address + target_lb ,
932
879
target_handle , op , request , lock_acquired );
933
880
if (OMPI_SUCCESS == ret ) {
934
881
return OMPI_SUCCESS ;
0 commit comments