20
20
*/
21
21
#include "nbc_internal.h"
22
22
23
+ static inline int allgather_sched_linear (
24
+ int rank , int comm_size , NBC_Schedule * schedule , const void * sendbuf ,
25
+ int scount , struct ompi_datatype_t * sdtype , void * recvbuf , int rcount ,
26
+ struct ompi_datatype_t * rdtype );
27
+ static inline int allgather_sched_recursivedoubling (
28
+ int rank , int comm_size , NBC_Schedule * schedule , const void * sbuf ,
29
+ int scount , struct ompi_datatype_t * sdtype , void * rbuf , int rcount ,
30
+ struct ompi_datatype_t * rdtype );
31
+
23
32
#ifdef NBC_CACHE_SCHEDULE
24
33
/* tree comparison function for schedule cache */
25
34
int NBC_Allgather_args_compare (NBC_Allgather_args * a , NBC_Allgather_args * b , void * param ) {
@@ -40,27 +49,38 @@ int NBC_Allgather_args_compare(NBC_Allgather_args *a, NBC_Allgather_args *b, voi
40
49
}
41
50
#endif
42
51
43
- /* simple linear MPI_Iallgather
44
- * the algorithm uses p-1 rounds
45
- * each node sends the packet it received last round (or has in round 0) to its right neighbor (modulo p)
46
- * each node receives from its left (modulo p) neighbor */
47
52
static int nbc_allgather_init (const void * sendbuf , int sendcount , MPI_Datatype sendtype , void * recvbuf , int recvcount ,
48
53
MPI_Datatype recvtype , struct ompi_communicator_t * comm , ompi_request_t * * request ,
49
54
struct mca_coll_base_module_2_3_0_t * module , bool persistent )
50
55
{
51
56
int rank , p , res ;
52
57
MPI_Aint rcvext ;
53
58
NBC_Schedule * schedule ;
54
- char * rbuf , * sbuf , inplace ;
59
+ char * rbuf , inplace ;
55
60
#ifdef NBC_CACHE_SCHEDULE
56
61
NBC_Allgather_args * args , * found , search ;
57
62
#endif
63
+ enum { NBC_ALLGATHER_LINEAR , NBC_ALLGATHER_RDBL } alg ;
58
64
ompi_coll_libnbc_module_t * libnbc_module = (ompi_coll_libnbc_module_t * ) module ;
59
65
60
66
NBC_IN_PLACE (sendbuf , recvbuf , inplace );
61
67
62
68
rank = ompi_comm_rank (comm );
63
69
p = ompi_comm_size (comm );
70
+ int is_commsize_pow2 = !(p & (p - 1 ));
71
+
72
+ if (libnbc_iallgather_algorithm == 0 ) {
73
+ alg = NBC_ALLGATHER_LINEAR ;
74
+ } else {
75
+ /* user forced dynamic decision */
76
+ if (libnbc_iallgather_algorithm == 1 ) {
77
+ alg = NBC_ALLGATHER_LINEAR ;
78
+ } else if (libnbc_iallgather_algorithm == 2 && is_commsize_pow2 ) {
79
+ alg = NBC_ALLGATHER_RDBL ;
80
+ } else {
81
+ alg = NBC_ALLGATHER_LINEAR ;
82
+ }
83
+ }
64
84
65
85
res = ompi_datatype_type_extent (recvtype , & rcvext );
66
86
if (MPI_SUCCESS != res ) {
@@ -98,36 +118,34 @@ static int nbc_allgather_init(const void* sendbuf, int sendcount, MPI_Datatype s
98
118
return OMPI_ERR_OUT_OF_RESOURCE ;
99
119
}
100
120
101
- sbuf = (char * )recvbuf + rank * recvcount * rcvext ;
102
-
103
- if (persistent && !inplace ) { /* for nonblocking, data has been copied already */
121
+ if (persistent && !inplace ) {
122
+ /* for nonblocking, data has been copied already */
104
123
/* copy my data to receive buffer (= send buffer of NBC_Sched_send) */
105
- res = NBC_Sched_copy ((void * )sendbuf , false, sendcount , sendtype ,
106
- sbuf , false, recvcount , recvtype , schedule , true);
124
+ rbuf = (char * )recvbuf + rank * recvcount * rcvext ;
125
+ res = NBC_Sched_copy ((void * )sendbuf , false, sendcount , sendtype ,
126
+ rbuf , false, recvcount , recvtype , schedule , true);
107
127
if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
108
128
OBJ_RELEASE (schedule );
109
129
return res ;
110
130
}
111
131
}
112
132
113
- /* do p-1 rounds */
114
- for (int r = 0 ; r < p ; ++ r ) {
115
- if (r != rank ) {
116
- /* recv from rank r */
117
- rbuf = (char * )recvbuf + r * recvcount * rcvext ;
118
- res = NBC_Sched_recv (rbuf , false, recvcount , recvtype , r , schedule , false);
119
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
120
- OBJ_RELEASE (schedule );
121
- return res ;
122
- }
133
+ switch (alg ) {
134
+ case NBC_ALLGATHER_LINEAR :
135
+ if (rank == 0 ) printf ("MK: LINEAR\n" );
136
+ res = allgather_sched_linear (rank , p , schedule , sendbuf , sendcount , sendtype ,
137
+ recvbuf , recvcount , recvtype );
138
+ break ;
139
+ case NBC_ALLGATHER_RDBL :
140
+ if (rank == 0 ) printf ("MK: RDBL\n" );
141
+ res = allgather_sched_recursivedoubling (rank , p , schedule , sendbuf , sendcount ,
142
+ sendtype , recvbuf , recvcount , recvtype );
143
+ break ;
144
+ }
123
145
124
- /* send to rank r - not from the sendbuf to optimize MPI_IN_PLACE */
125
- res = NBC_Sched_send (sbuf , false, recvcount , recvtype , r , schedule , false);
126
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
127
- OBJ_RELEASE (schedule );
128
- return res ;
129
- }
130
- }
146
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
147
+ OBJ_RELEASE (schedule );
148
+ return res ;
131
149
}
132
150
133
151
res = NBC_Sched_commit (schedule );
@@ -270,6 +288,109 @@ int ompi_coll_libnbc_iallgather_inter(const void* sendbuf, int sendcount, MPI_Da
270
288
return OMPI_SUCCESS ;
271
289
}
272
290
291
+ /*
292
+ * allgather_sched_linear
293
+ *
294
+ * Description: an implementation of Iallgather using linear algorithm
295
+ *
296
+ * Time: O(comm_size)
297
+ * Schedule length (rounds): O(comm_size)
298
+ */
299
+ static inline int allgather_sched_linear (
300
+ int rank , int comm_size , NBC_Schedule * schedule , const void * sendbuf ,
301
+ int scount , struct ompi_datatype_t * sdtype , void * recvbuf , int rcount ,
302
+ struct ompi_datatype_t * rdtype )
303
+ {
304
+ int res = OMPI_SUCCESS ;
305
+ ptrdiff_t rlb , rext ;
306
+
307
+ res = ompi_datatype_get_extent (rdtype , & rlb , & rext );
308
+ char * sbuf = (char * )recvbuf + rank * rcount * rext ;
309
+
310
+ for (int remote = 0 ; remote < comm_size ; ++ remote ) {
311
+ if (remote != rank ) {
312
+ /* Recv from rank remote */
313
+ char * rbuf = (char * )recvbuf + remote * rcount * rext ;
314
+ res = NBC_Sched_recv (rbuf , false, rcount , rdtype , remote , schedule , false);
315
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
316
+
317
+ /* Send to rank remote - not from the sendbuf to optimize MPI_IN_PLACE */
318
+ res = NBC_Sched_send (sbuf , false, rcount , rdtype , remote , schedule , false);
319
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
320
+ }
321
+ }
322
+
323
+ cleanup_and_return :
324
+ return res ;
325
+ }
326
+
327
+ /*
328
+ * allgather_sched_recursivedoubling
329
+ *
330
+ * Description: an implementation of Iallgather using recursive doubling algorithm
331
+ * Limitation: power-of-two number of processes only
332
+ * Time: O(log(comm_size))
333
+ * Schedule length (rounds): O(log(comm_size))
334
+ * Memory: no additional memory requirements beyond user-supplied buffers.
335
+ *
336
+ * Example on 4 nodes:
337
+ * Initialization: everyone has its own buffer at location rank in rbuf
338
+ * # 0 1 2 3
339
+ * [0] [ ] [ ] [ ]
340
+ * [ ] [1] [ ] [ ]
341
+ * [ ] [ ] [2] [ ]
342
+ * [ ] [ ] [ ] [3]
343
+ * Step 0: exchange data with (rank ^ 2^0)
344
+ * # 0 1 2 3
345
+ * [0] [0] [ ] [ ]
346
+ * [1] [1] [ ] [ ]
347
+ * [ ] [ ] [2] [2]
348
+ * [ ] [ ] [3] [3]
349
+ * Step 1: exchange data with (rank ^ 2^1) (if you can)
350
+ * # 0 1 2 3
351
+ * [0] [0] [0] [0]
352
+ * [1] [1] [1] [1]
353
+ * [2] [2] [2] [2]
354
+ * [3] [3] [3] [3]
355
+ *
356
+ */
357
+ static inline int allgather_sched_recursivedoubling (
358
+ int rank , int comm_size , NBC_Schedule * schedule , const void * sbuf ,
359
+ int scount , struct ompi_datatype_t * sdtype , void * rbuf , int rcount ,
360
+ struct ompi_datatype_t * rdtype )
361
+ {
362
+ int res = OMPI_SUCCESS ;
363
+ ptrdiff_t rlb , rext ;
364
+ char * tmpsend = NULL , * tmprecv = NULL ;
365
+
366
+ res = ompi_datatype_get_extent (rdtype , & rlb , & rext );
367
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
368
+
369
+ int sendblocklocation = rank ;
370
+ for (int distance = 1 ; distance < comm_size ; distance <<= 1 ) {
371
+ int remote = rank ^ distance ;
372
+
373
+ tmpsend = (char * )rbuf + (ptrdiff_t )sendblocklocation * (ptrdiff_t )rcount * rext ;
374
+ if (rank < remote ) {
375
+ tmprecv = (char * )rbuf + (ptrdiff_t )(sendblocklocation + distance ) * (ptrdiff_t )rcount * rext ;
376
+ } else {
377
+ tmprecv = (char * )rbuf + (ptrdiff_t )(sendblocklocation - distance ) * (ptrdiff_t )rcount * rext ;
378
+ sendblocklocation -= distance ;
379
+ }
380
+
381
+ res = NBC_Sched_send (tmpsend , false, (ptrdiff_t )distance * (ptrdiff_t )rcount ,
382
+ rdtype , remote , schedule , false);
383
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
384
+
385
+ res = NBC_Sched_recv (tmprecv , false, (ptrdiff_t )distance * (ptrdiff_t )rcount ,
386
+ rdtype , remote , schedule , true);
387
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
388
+ }
389
+
390
+ cleanup_and_return :
391
+ return res ;
392
+ }
393
+
273
394
int ompi_coll_libnbc_allgather_init (const void * sendbuf , int sendcount , MPI_Datatype sendtype , void * recvbuf , int recvcount ,
274
395
MPI_Datatype recvtype , struct ompi_communicator_t * comm , MPI_Info info , ompi_request_t * * request ,
275
396
struct mca_coll_base_module_2_3_0_t * module ) {
0 commit comments