18
18
* Author(s): Torsten Hoefler <htor@cs.indiana.edu>
19
19
*
20
20
*/
21
+ #include "opal/include/opal/align.h"
22
+ #include "ompi/op/op.h"
23
+
21
24
#include "nbc_internal.h"
22
25
26
+ static inline int exscan_sched_linear (
27
+ int rank , int comm_size , const void * sendbuf , void * recvbuf , int count ,
28
+ MPI_Datatype datatype , MPI_Op op , char inplace , NBC_Schedule * schedule ,
29
+ void * tmpbuf );
30
+ static inline int exscan_sched_recursivedoubling (
31
+ int rank , int comm_size , const void * sendbuf , void * recvbuf ,
32
+ int count , MPI_Datatype datatype , MPI_Op op , char inplace ,
33
+ NBC_Schedule * schedule , void * tmpbuf1 , void * tmpbuf2 );
34
+
23
35
#ifdef NBC_CACHE_SCHEDULE
24
36
/* tree comparison function for schedule cache */
25
37
int NBC_Scan_args_compare (NBC_Scan_args * a , NBC_Scan_args * b , void * param ) {
@@ -39,32 +51,44 @@ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) {
39
51
}
40
52
#endif
41
53
42
- /* linear iexscan
43
- * working principle:
44
- * 1. each node (but node 0) receives from left neigbor
45
- * 2. performs op
46
- * 3. all but rank p-1 do sends to it's right neigbor and exits
47
- *
48
- */
49
54
static int nbc_exscan_init (const void * sendbuf , void * recvbuf , int count , MPI_Datatype datatype , MPI_Op op ,
50
55
struct ompi_communicator_t * comm , ompi_request_t * * request ,
51
56
struct mca_coll_base_module_2_3_0_t * module , bool persistent ) {
52
57
int rank , p , res ;
53
- ptrdiff_t gap , span ;
54
58
NBC_Schedule * schedule ;
55
- #ifdef NBC_CACHE_SCHEDULE
56
- NBC_Scan_args * args , * found , search ;
57
- #endif
58
59
char inplace ;
59
- void * tmpbuf = NULL ;
60
+ void * tmpbuf = NULL , * tmpbuf1 = NULL , * tmpbuf2 = NULL ;
61
+ enum { NBC_EXSCAN_LINEAR , NBC_EXSCAN_RDBL } alg ;
60
62
ompi_coll_libnbc_module_t * libnbc_module = (ompi_coll_libnbc_module_t * ) module ;
63
+ ptrdiff_t span , gap ;
61
64
62
65
NBC_IN_PLACE (sendbuf , recvbuf , inplace );
63
66
64
- rank = ompi_comm_rank (comm );
65
- p = ompi_comm_size (comm );
67
+ rank = ompi_comm_rank (comm );
68
+ p = ompi_comm_size (comm );
69
+
70
+ if (p < 2 ) {
71
+ return nbc_get_noop_request (persistent , request );
72
+ }
73
+
74
+ span = opal_datatype_span (& datatype -> super , count , & gap );
75
+ if (libnbc_iexscan_algorithm == 2 ) {
76
+ alg = NBC_EXSCAN_RDBL ;
77
+ ptrdiff_t span_align = OPAL_ALIGN (span , datatype -> super .align , ptrdiff_t );
78
+ tmpbuf = malloc (span_align + span );
79
+ if (NULL == tmpbuf ) { return OMPI_ERR_OUT_OF_RESOURCE ; }
80
+ tmpbuf1 = (void * )(- gap );
81
+ tmpbuf2 = (char * )(span_align ) - gap ;
82
+ } else {
83
+ alg = NBC_EXSCAN_LINEAR ;
84
+ if (rank > 0 ) {
85
+ tmpbuf = malloc (span );
86
+ if (NULL == tmpbuf ) { return OMPI_ERR_OUT_OF_RESOURCE ; }
87
+ }
88
+ }
66
89
67
90
#ifdef NBC_CACHE_SCHEDULE
91
+ NBC_Scan_args * args , * found , search ;
68
92
/* search schedule in communicator specific tree */
69
93
search .sendbuf = sendbuf ;
70
94
search .recvbuf = recvbuf ;
@@ -74,84 +98,31 @@ static int nbc_exscan_init(const void* sendbuf, void* recvbuf, int count, MPI_Da
74
98
found = (NBC_Scan_args * ) hb_tree_search ((hb_tree * ) libnbc_module -> NBC_Dict [NBC_EXSCAN ], & search );
75
99
if (NULL == found ) {
76
100
#endif
77
- schedule = OBJ_NEW (NBC_Schedule );
78
- if (OPAL_UNLIKELY (NULL == schedule )) {
79
- free (tmpbuf );
80
- return OMPI_ERR_OUT_OF_RESOURCE ;
81
- }
82
-
83
- if (rank != 0 ) {
84
- span = opal_datatype_span (& datatype -> super , count , & gap );
85
- tmpbuf = malloc (span );
86
- if (NULL == tmpbuf ) {
87
- return OMPI_ERR_OUT_OF_RESOURCE ;
88
- }
89
- if (inplace ) {
90
- res = NBC_Sched_copy (recvbuf , false, count , datatype ,
91
- (char * )tmpbuf - gap , false, count , datatype , schedule , false);
92
- } else {
93
- res = NBC_Sched_copy ((void * )sendbuf , false, count , datatype ,
94
- (char * )tmpbuf - gap , false, count , datatype , schedule , false);
95
- }
96
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
97
- OBJ_RELEASE (schedule );
98
- free (tmpbuf );
99
- return res ;
100
- }
101
-
102
- res = NBC_Sched_recv (recvbuf , false, count , datatype , rank - 1 , schedule , false);
103
-
104
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
105
- OBJ_RELEASE (schedule );
106
- free (tmpbuf );
107
- return res ;
108
- }
109
-
110
- if (rank < p - 1 ) {
111
- /* we have to wait until we have the data */
112
- res = NBC_Sched_barrier (schedule );
113
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
114
- OBJ_RELEASE (schedule );
115
- free (tmpbuf );
116
- return res ;
117
- }
118
-
119
- res = NBC_Sched_op (recvbuf , false, (void * )(- gap ), true, count ,
120
- datatype , op , schedule , true);
121
-
122
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
123
- OBJ_RELEASE (schedule );
124
- free (tmpbuf );
125
- return res ;
126
- }
101
+ schedule = OBJ_NEW (NBC_Schedule );
102
+ if (OPAL_UNLIKELY (NULL == schedule )) {
103
+ free (tmpbuf );
104
+ return OMPI_ERR_OUT_OF_RESOURCE ;
105
+ }
127
106
128
- /* send reduced data onward */
129
- res = NBC_Sched_send ((void * )(- gap ), true, count , datatype , rank + 1 , schedule , false);
130
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
131
- OBJ_RELEASE (schedule );
132
- free (tmpbuf );
133
- return res ;
134
- }
135
- }
136
- } else if (p > 1 ) {
137
- if (inplace ) {
138
- res = NBC_Sched_send (recvbuf , false, count , datatype , 1 , schedule , false);
139
- } else {
140
- res = NBC_Sched_send (sendbuf , false, count , datatype , 1 , schedule , false);
141
- }
142
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
143
- OBJ_RELEASE (schedule );
144
- free (tmpbuf );
145
- return res ;
146
- }
147
- }
107
+ if (alg == NBC_EXSCAN_LINEAR ) {
108
+ res = exscan_sched_linear (rank , p , sendbuf , recvbuf , count , datatype ,
109
+ op , inplace , schedule , tmpbuf );
110
+ } else {
111
+ res = exscan_sched_recursivedoubling (rank , p , sendbuf , recvbuf , count ,
112
+ datatype , op , inplace , schedule , tmpbuf1 , tmpbuf2 );
113
+ }
114
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
115
+ OBJ_RELEASE (schedule );
116
+ free (tmpbuf );
117
+ return res ;
118
+ }
148
119
149
- res = NBC_Sched_commit (schedule );
150
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
151
- OBJ_RELEASE (schedule );
152
- free (tmpbuf );
153
- return res ;
154
- }
120
+ res = NBC_Sched_commit (schedule );
121
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
122
+ OBJ_RELEASE (schedule );
123
+ free (tmpbuf );
124
+ return res ;
125
+ }
155
126
156
127
#ifdef NBC_CACHE_SCHEDULE
157
128
/* save schedule to tree */
@@ -224,3 +195,168 @@ int ompi_coll_libnbc_exscan_init(const void* sendbuf, void* recvbuf, int count,
224
195
225
196
return OMPI_SUCCESS ;
226
197
}
198
+
199
+ /*
200
+ * exscan_sched_linear:
201
+ *
202
+ * Function: Linear algorithm for exclusive scan.
203
+ * Accepts: Same as MPI_Iexscan
204
+ * Returns: MPI_SUCCESS or error code
205
+ *
206
+ * Working principle:
207
+ * 1. Each process (but process 0) receives from left neighbor
208
+ * 2. Performs op
209
+ * 3. All but rank p - 1 do sends to it's right neighbor and exits
210
+ *
211
+ * Schedule length: O(1)
212
+ */
213
+ static inline int exscan_sched_linear (
214
+ int rank , int comm_size , const void * sendbuf , void * recvbuf , int count ,
215
+ MPI_Datatype datatype , MPI_Op op , char inplace , NBC_Schedule * schedule ,
216
+ void * tmpbuf )
217
+ {
218
+ int res = OMPI_SUCCESS ;
219
+ ptrdiff_t gap ;
220
+ opal_datatype_span (& datatype -> super , count , & gap );
221
+
222
+ if (rank > 0 ) {
223
+ if (inplace ) {
224
+ res = NBC_Sched_copy (recvbuf , false, count , datatype ,
225
+ (char * )tmpbuf - gap , false, count , datatype , schedule , false);
226
+ } else {
227
+ res = NBC_Sched_copy ((void * )sendbuf , false, count , datatype ,
228
+ (char * )tmpbuf - gap , false, count , datatype , schedule , false);
229
+ }
230
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
231
+
232
+ res = NBC_Sched_recv (recvbuf , false, count , datatype , rank - 1 , schedule , false);
233
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
234
+
235
+ if (rank < comm_size - 1 ) {
236
+ /* We have to wait until we have the data */
237
+ res = NBC_Sched_barrier (schedule );
238
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
239
+
240
+ res = NBC_Sched_op (recvbuf , false, (void * )(- gap ), true, count ,
241
+ datatype , op , schedule , true);
242
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
243
+
244
+ /* Send reduced data onward */
245
+ res = NBC_Sched_send ((void * )(- gap ), true, count , datatype , rank + 1 , schedule , false);
246
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
247
+ }
248
+ } else if (comm_size > 1 ) {
249
+ /* Process 0 */
250
+ if (inplace ) {
251
+ res = NBC_Sched_send (recvbuf , false, count , datatype , 1 , schedule , false);
252
+ } else {
253
+ res = NBC_Sched_send (sendbuf , false, count , datatype , 1 , schedule , false);
254
+ }
255
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
256
+ }
257
+
258
+ cleanup_and_return :
259
+ return res ;
260
+ }
261
+
262
+ /*
263
+ * exscan_sched_recursivedoubling:
264
+ *
265
+ * Function: Recursive doubling algorithm for exclusive scan.
266
+ * Accepts: Same as MPI_Iexscan
267
+ * Returns: MPI_SUCCESS or error code
268
+ *
269
+ * Description: Implements recursive doubling algorithm for MPI_Iexscan.
270
+ * The algorithm preserves order of operations so it can
271
+ * be used both by commutative and non-commutative operations.
272
+ *
273
+ * Example for 5 processes and commutative operation MPI_SUM:
274
+ * Process: 0 1 2 3 4
275
+ * recvbuf: - - - - -
276
+ * psend: [0] [1] [2] [3] [4]
277
+ *
278
+ * Step 1:
279
+ * recvbuf: - [0] - [2] -
280
+ * psend: [1+0] [0+1] [3+2] [2+3] [4]
281
+ *
282
+ * Step 2:
283
+ * recvbuf: - [0] [1+0] [(0+1)+2] -
284
+ * psend: [(3+2)+(1+0)] [(2+3)+(0+1)] [(1+0)+(3+2)] [(1+0)+(2+3)] [4]
285
+ *
286
+ * Step 3:
287
+ * recvbuf: - [0] [1+0] [(0+1)+2] [(3+2)+(1+0)]
288
+ * psend: [4+((3+2)+(1+0))] [((3+2)+(1+0))+4]
289
+ *
290
+ * Time complexity (worst case): \ceil(\log_2(p))(2\alpha + 2m\beta + 2m\gamma)
291
+ * Memory requirements (per process): 2 * count * typesize = O(count)
292
+ * Limitations: intra-communicators only
293
+ * Schedule length: O(log(p))
294
+ */
295
+ static inline int exscan_sched_recursivedoubling (
296
+ int rank , int comm_size , const void * sendbuf , void * recvbuf , int count ,
297
+ MPI_Datatype datatype , MPI_Op op , char inplace ,
298
+ NBC_Schedule * schedule , void * tmpbuf1 , void * tmpbuf2 )
299
+ {
300
+ int res = OMPI_SUCCESS ;
301
+ char * psend = (char * )tmpbuf1 ;
302
+ char * precv = (char * )tmpbuf2 ;
303
+
304
+ if (!inplace ) {
305
+ res = NBC_Sched_copy ((char * )sendbuf , false, count , datatype ,
306
+ psend , true, count , datatype , schedule , true);
307
+ } else {
308
+ res = NBC_Sched_copy ((char * )recvbuf , false, count , datatype ,
309
+ psend , true, count , datatype , schedule , true);
310
+ }
311
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
312
+
313
+ int is_commute = ompi_op_is_commute (op );
314
+ int is_first_block = 1 ;
315
+
316
+ for (int mask = 1 ; mask < comm_size ; mask <<= 1 ) {
317
+ int remote = rank ^ mask ;
318
+ if (remote < comm_size ) {
319
+ res = NBC_Sched_send (psend , true, count , datatype , remote , schedule , false);
320
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
321
+ res = NBC_Sched_recv (precv , true, count , datatype , remote , schedule , true);
322
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
323
+
324
+ if (rank > remote ) {
325
+ /* Assertion: rank > 0 and rbuf is valid */
326
+ if (is_first_block ) {
327
+ res = NBC_Sched_copy (precv , true, count , datatype ,
328
+ recvbuf , false, count , datatype , schedule , false);
329
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
330
+ is_first_block = 0 ;
331
+ } else {
332
+ /* Accumulate prefix reduction: recvbuf = precv <op> recvbuf */
333
+ res = NBC_Sched_op (precv , true, recvbuf , false, count ,
334
+ datatype , op , schedule , false);
335
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
336
+ }
337
+ /* Partial result: psend = precv <op> psend */
338
+ res = NBC_Sched_op (precv , true, psend , true, count ,
339
+ datatype , op , schedule , true);
340
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
341
+ } else {
342
+ if (is_commute ) {
343
+ /* psend = precv <op> psend */
344
+ res = NBC_Sched_op (precv , true, psend , true, count ,
345
+ datatype , op , schedule , true);
346
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
347
+ } else {
348
+ /* precv = psend <op> precv */
349
+ res = NBC_Sched_op (psend , true, precv , true, count ,
350
+ datatype , op , schedule , true);
351
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
352
+ char * tmp = psend ;
353
+ psend = precv ;
354
+ precv = tmp ;
355
+ }
356
+ }
357
+ }
358
+ }
359
+
360
+ cleanup_and_return :
361
+ return res ;
362
+ }
0 commit comments