Skip to content

Commit f11fea0

Browse files
authored
Merge pull request #5718 from mkurnosov/coll-iexscan-recursivedoubling
coll/libnbc: add recursive doubling algorithm for MPI_Iexscan
2 parents 1a14785 + 9557fa0 commit f11fea0

File tree

3 files changed

+245
-90
lines changed

3 files changed

+245
-90
lines changed

ompi/mca/coll/libnbc/coll_libnbc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ BEGIN_C_DECLS
7070
#define NBC_NUM_COLL 17
7171

7272
extern bool libnbc_ibcast_skip_dt_decision;
73+
extern int libnbc_iexscan_algorithm;
7374
extern int libnbc_iscan_algorithm;
7475

7576
struct ompi_coll_libnbc_component_t {

ompi/mca/coll/libnbc/coll_libnbc_component.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,14 @@ static int libnbc_priority = 10;
4646
static bool libnbc_in_progress = false; /* protect from recursive calls */
4747
bool libnbc_ibcast_skip_dt_decision = true;
4848

49+
int libnbc_iexscan_algorithm = 0; /* iexscan user forced algorithm */
50+
static mca_base_var_enum_value_t iexscan_algorithms[] = {
51+
{0, "ignore"},
52+
{1, "linear"},
53+
{2, "recursive_doubling"},
54+
{0, NULL}
55+
};
56+
4957
int libnbc_iscan_algorithm = 0; /* iscan user forced algorithm */
5058
static mca_base_var_enum_value_t iscan_algorithms[] = {
5159
{0, "ignore"},
@@ -167,6 +175,16 @@ libnbc_register(void)
167175
MCA_BASE_VAR_SCOPE_READONLY,
168176
&libnbc_ibcast_skip_dt_decision);
169177

178+
libnbc_iexscan_algorithm = 0;
179+
(void) mca_base_var_enum_create("coll_libnbc_iexscan_algorithms", iexscan_algorithms, &new_enum);
180+
mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version,
181+
"iexscan_algorithm",
182+
"Which iexscan algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling",
183+
MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
184+
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL,
185+
&libnbc_iexscan_algorithm);
186+
OBJ_RELEASE(new_enum);
187+
170188
libnbc_iscan_algorithm = 0;
171189
(void) mca_base_var_enum_create("coll_libnbc_iscan_algorithms", iscan_algorithms, &new_enum);
172190
mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version,

ompi/mca/coll/libnbc/nbc_iexscan.c

Lines changed: 226 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,20 @@
1818
* Author(s): Torsten Hoefler <htor@cs.indiana.edu>
1919
*
2020
*/
21+
#include "opal/include/opal/align.h"
22+
#include "ompi/op/op.h"
23+
2124
#include "nbc_internal.h"
2225

26+
static inline int exscan_sched_linear(
27+
int rank, int comm_size, const void *sendbuf, void *recvbuf, int count,
28+
MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule,
29+
void *tmpbuf);
30+
static inline int exscan_sched_recursivedoubling(
31+
int rank, int comm_size, const void *sendbuf, void *recvbuf,
32+
int count, MPI_Datatype datatype, MPI_Op op, char inplace,
33+
NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2);
34+
2335
#ifdef NBC_CACHE_SCHEDULE
2436
/* tree comparison function for schedule cache */
2537
int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) {
@@ -39,32 +51,44 @@ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) {
3951
}
4052
#endif
4153

42-
/* linear iexscan
43-
* working principle:
44-
* 1. each node (but node 0) receives from left neigbor
45-
* 2. performs op
46-
* 3. all but rank p-1 do sends to it's right neigbor and exits
47-
*
48-
*/
4954
static int nbc_exscan_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
5055
struct ompi_communicator_t *comm, ompi_request_t ** request,
5156
struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
5257
int rank, p, res;
53-
ptrdiff_t gap, span;
5458
NBC_Schedule *schedule;
55-
#ifdef NBC_CACHE_SCHEDULE
56-
NBC_Scan_args *args, *found, search;
57-
#endif
5859
char inplace;
59-
void *tmpbuf = NULL;
60+
void *tmpbuf = NULL, *tmpbuf1 = NULL, *tmpbuf2 = NULL;
61+
enum { NBC_EXSCAN_LINEAR, NBC_EXSCAN_RDBL } alg;
6062
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
63+
ptrdiff_t span, gap;
6164

6265
NBC_IN_PLACE(sendbuf, recvbuf, inplace);
6366

64-
rank = ompi_comm_rank (comm);
65-
p = ompi_comm_size (comm);
67+
rank = ompi_comm_rank(comm);
68+
p = ompi_comm_size(comm);
69+
70+
if (p < 2) {
71+
return nbc_get_noop_request(persistent, request);
72+
}
73+
74+
span = opal_datatype_span(&datatype->super, count, &gap);
75+
if (libnbc_iexscan_algorithm == 2) {
76+
alg = NBC_EXSCAN_RDBL;
77+
ptrdiff_t span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t);
78+
tmpbuf = malloc(span_align + span);
79+
if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; }
80+
tmpbuf1 = (void *)(-gap);
81+
tmpbuf2 = (char *)(span_align) - gap;
82+
} else {
83+
alg = NBC_EXSCAN_LINEAR;
84+
if (rank > 0) {
85+
tmpbuf = malloc(span);
86+
if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; }
87+
}
88+
}
6689

6790
#ifdef NBC_CACHE_SCHEDULE
91+
NBC_Scan_args *args, *found, search;
6892
/* search schedule in communicator specific tree */
6993
search.sendbuf = sendbuf;
7094
search.recvbuf = recvbuf;
@@ -74,84 +98,31 @@ static int nbc_exscan_init(const void* sendbuf, void* recvbuf, int count, MPI_Da
7498
found = (NBC_Scan_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_EXSCAN], &search);
7599
if (NULL == found) {
76100
#endif
77-
schedule = OBJ_NEW(NBC_Schedule);
78-
if (OPAL_UNLIKELY(NULL == schedule)) {
79-
free(tmpbuf);
80-
return OMPI_ERR_OUT_OF_RESOURCE;
81-
}
82-
83-
if (rank != 0) {
84-
span = opal_datatype_span(&datatype->super, count, &gap);
85-
tmpbuf = malloc(span);
86-
if (NULL == tmpbuf) {
87-
return OMPI_ERR_OUT_OF_RESOURCE;
88-
}
89-
if (inplace) {
90-
res = NBC_Sched_copy(recvbuf, false, count, datatype,
91-
(char *)tmpbuf-gap, false, count, datatype, schedule, false);
92-
} else {
93-
res = NBC_Sched_copy((void *)sendbuf, false, count, datatype,
94-
(char *)tmpbuf-gap, false, count, datatype, schedule, false);
95-
}
96-
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
97-
OBJ_RELEASE(schedule);
98-
free(tmpbuf);
99-
return res;
100-
}
101-
102-
res = NBC_Sched_recv (recvbuf, false, count, datatype, rank-1, schedule, false);
103-
104-
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
105-
OBJ_RELEASE(schedule);
106-
free(tmpbuf);
107-
return res;
108-
}
109-
110-
if (rank < p - 1) {
111-
/* we have to wait until we have the data */
112-
res = NBC_Sched_barrier(schedule);
113-
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
114-
OBJ_RELEASE(schedule);
115-
free(tmpbuf);
116-
return res;
117-
}
118-
119-
res = NBC_Sched_op (recvbuf, false, (void *)(-gap), true, count,
120-
datatype, op, schedule, true);
121-
122-
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
123-
OBJ_RELEASE(schedule);
124-
free(tmpbuf);
125-
return res;
126-
}
101+
schedule = OBJ_NEW(NBC_Schedule);
102+
if (OPAL_UNLIKELY(NULL == schedule)) {
103+
free(tmpbuf);
104+
return OMPI_ERR_OUT_OF_RESOURCE;
105+
}
127106

128-
/* send reduced data onward */
129-
res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rank + 1, schedule, false);
130-
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
131-
OBJ_RELEASE(schedule);
132-
free(tmpbuf);
133-
return res;
134-
}
135-
}
136-
} else if (p > 1) {
137-
if (inplace) {
138-
res = NBC_Sched_send (recvbuf, false, count, datatype, 1, schedule, false);
139-
} else {
140-
res = NBC_Sched_send (sendbuf, false, count, datatype, 1, schedule, false);
141-
}
142-
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
143-
OBJ_RELEASE(schedule);
144-
free(tmpbuf);
145-
return res;
146-
}
147-
}
107+
if (alg == NBC_EXSCAN_LINEAR) {
108+
res = exscan_sched_linear(rank, p, sendbuf, recvbuf, count, datatype,
109+
op, inplace, schedule, tmpbuf);
110+
} else {
111+
res = exscan_sched_recursivedoubling(rank, p, sendbuf, recvbuf, count,
112+
datatype, op, inplace, schedule, tmpbuf1, tmpbuf2);
113+
}
114+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
115+
OBJ_RELEASE(schedule);
116+
free(tmpbuf);
117+
return res;
118+
}
148119

149-
res = NBC_Sched_commit(schedule);
150-
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
151-
OBJ_RELEASE(schedule);
152-
free(tmpbuf);
153-
return res;
154-
}
120+
res = NBC_Sched_commit(schedule);
121+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
122+
OBJ_RELEASE(schedule);
123+
free(tmpbuf);
124+
return res;
125+
}
155126

156127
#ifdef NBC_CACHE_SCHEDULE
157128
/* save schedule to tree */
@@ -224,3 +195,168 @@ int ompi_coll_libnbc_exscan_init(const void* sendbuf, void* recvbuf, int count,
224195

225196
return OMPI_SUCCESS;
226197
}
198+
199+
/*
200+
* exscan_sched_linear:
201+
*
202+
* Function: Linear algorithm for exclusive scan.
203+
* Accepts: Same as MPI_Iexscan
204+
* Returns: MPI_SUCCESS or error code
205+
*
206+
* Working principle:
207+
* 1. Each process (but process 0) receives from left neighbor
208+
* 2. Performs op
209+
* 3. All but rank p - 1 do sends to it's right neighbor and exits
210+
*
211+
* Schedule length: O(1)
212+
*/
213+
static inline int exscan_sched_linear(
214+
int rank, int comm_size, const void *sendbuf, void *recvbuf, int count,
215+
MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule,
216+
void *tmpbuf)
217+
{
218+
int res = OMPI_SUCCESS;
219+
ptrdiff_t gap;
220+
opal_datatype_span(&datatype->super, count, &gap);
221+
222+
if (rank > 0) {
223+
if (inplace) {
224+
res = NBC_Sched_copy(recvbuf, false, count, datatype,
225+
(char *)tmpbuf - gap, false, count, datatype, schedule, false);
226+
} else {
227+
res = NBC_Sched_copy((void *)sendbuf, false, count, datatype,
228+
(char *)tmpbuf - gap, false, count, datatype, schedule, false);
229+
}
230+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
231+
232+
res = NBC_Sched_recv(recvbuf, false, count, datatype, rank - 1, schedule, false);
233+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
234+
235+
if (rank < comm_size - 1) {
236+
/* We have to wait until we have the data */
237+
res = NBC_Sched_barrier(schedule);
238+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
239+
240+
res = NBC_Sched_op(recvbuf, false, (void *)(-gap), true, count,
241+
datatype, op, schedule, true);
242+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
243+
244+
/* Send reduced data onward */
245+
res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rank + 1, schedule, false);
246+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
247+
}
248+
} else if (comm_size > 1) {
249+
/* Process 0 */
250+
if (inplace) {
251+
res = NBC_Sched_send(recvbuf, false, count, datatype, 1, schedule, false);
252+
} else {
253+
res = NBC_Sched_send(sendbuf, false, count, datatype, 1, schedule, false);
254+
}
255+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
256+
}
257+
258+
cleanup_and_return:
259+
return res;
260+
}
261+
262+
/*
263+
* exscan_sched_recursivedoubling:
264+
*
265+
* Function: Recursive doubling algorithm for exclusive scan.
266+
* Accepts: Same as MPI_Iexscan
267+
* Returns: MPI_SUCCESS or error code
268+
*
269+
* Description: Implements recursive doubling algorithm for MPI_Iexscan.
270+
* The algorithm preserves order of operations so it can
271+
* be used both by commutative and non-commutative operations.
272+
*
273+
* Example for 5 processes and commutative operation MPI_SUM:
274+
* Process: 0 1 2 3 4
275+
* recvbuf: - - - - -
276+
* psend: [0] [1] [2] [3] [4]
277+
*
278+
* Step 1:
279+
* recvbuf: - [0] - [2] -
280+
* psend: [1+0] [0+1] [3+2] [2+3] [4]
281+
*
282+
* Step 2:
283+
* recvbuf: - [0] [1+0] [(0+1)+2] -
284+
* psend: [(3+2)+(1+0)] [(2+3)+(0+1)] [(1+0)+(3+2)] [(1+0)+(2+3)] [4]
285+
*
286+
* Step 3:
287+
* recvbuf: - [0] [1+0] [(0+1)+2] [(3+2)+(1+0)]
288+
* psend: [4+((3+2)+(1+0))] [((3+2)+(1+0))+4]
289+
*
290+
* Time complexity (worst case): \ceil(\log_2(p))(2\alpha + 2m\beta + 2m\gamma)
291+
* Memory requirements (per process): 2 * count * typesize = O(count)
292+
* Limitations: intra-communicators only
293+
* Schedule length: O(log(p))
294+
*/
295+
static inline int exscan_sched_recursivedoubling(
296+
int rank, int comm_size, const void *sendbuf, void *recvbuf, int count,
297+
MPI_Datatype datatype, MPI_Op op, char inplace,
298+
NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2)
299+
{
300+
int res = OMPI_SUCCESS;
301+
char *psend = (char *)tmpbuf1;
302+
char *precv = (char *)tmpbuf2;
303+
304+
if (!inplace) {
305+
res = NBC_Sched_copy((char *)sendbuf, false, count, datatype,
306+
psend, true, count, datatype, schedule, true);
307+
} else {
308+
res = NBC_Sched_copy((char *)recvbuf, false, count, datatype,
309+
psend, true, count, datatype, schedule, true);
310+
}
311+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
312+
313+
int is_commute = ompi_op_is_commute(op);
314+
int is_first_block = 1;
315+
316+
for (int mask = 1; mask < comm_size; mask <<= 1) {
317+
int remote = rank ^ mask;
318+
if (remote < comm_size) {
319+
res = NBC_Sched_send(psend, true, count, datatype, remote, schedule, false);
320+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
321+
res = NBC_Sched_recv(precv, true, count, datatype, remote, schedule, true);
322+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
323+
324+
if (rank > remote) {
325+
/* Assertion: rank > 0 and rbuf is valid */
326+
if (is_first_block) {
327+
res = NBC_Sched_copy(precv, true, count, datatype,
328+
recvbuf, false, count, datatype, schedule, false);
329+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
330+
is_first_block = 0;
331+
} else {
332+
/* Accumulate prefix reduction: recvbuf = precv <op> recvbuf */
333+
res = NBC_Sched_op(precv, true, recvbuf, false, count,
334+
datatype, op, schedule, false);
335+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
336+
}
337+
/* Partial result: psend = precv <op> psend */
338+
res = NBC_Sched_op(precv, true, psend, true, count,
339+
datatype, op, schedule, true);
340+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
341+
} else {
342+
if (is_commute) {
343+
/* psend = precv <op> psend */
344+
res = NBC_Sched_op(precv, true, psend, true, count,
345+
datatype, op, schedule, true);
346+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
347+
} else {
348+
/* precv = psend <op> precv */
349+
res = NBC_Sched_op(psend, true, precv, true, count,
350+
datatype, op, schedule, true);
351+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
352+
char *tmp = psend;
353+
psend = precv;
354+
precv = tmp;
355+
}
356+
}
357+
}
358+
}
359+
360+
cleanup_and_return:
361+
return res;
362+
}

0 commit comments

Comments
 (0)