
Commit ff03f43

Fix the libnbc MPI_Iallreduce ring algorithm.

The ring version did not support MPI_IN_PLACE and had no protection against it, so instead of nicely bailing out it was delivering wrong results. This patch addresses the issue, simplifies the computation of the segment sizes, and minimizes the temporary memory size. Fixes #9385.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
1 parent f882930 commit ff03f43
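
For context on the failure mode: with MPI_IN_PLACE the caller passes a single buffer that serves as both input and output, so inside the collective sendbuf and recvbuf alias each other and there is no separate send buffer to reduce from. A minimal caller-side sketch of the case that used to produce wrong results (standard MPI-3 API; buffer contents are illustrative only):

```c
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    double buf[4] = { rank, rank, rank, rank };
    MPI_Request req;

    /* MPI_IN_PLACE: buf is both send and receive buffer -- exactly
     * the case the ring algorithm previously mishandled. */
    MPI_Iallreduce(MPI_IN_PLACE, buf, 4, MPI_DOUBLE, MPI_SUM,
                   MPI_COMM_WORLD, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank) printf("buf[0] = %g\n", buf[0]);
    MPI_Finalize();
    return 0;
}
```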

1 file changed: +37 -42 lines changed

ompi/mca/coll/libnbc/nbc_iallreduce.c

@@ -112,15 +112,14 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI
     return OMPI_ERR_OUT_OF_RESOURCE;
   }
 
+  alg = NBC_ARED_RING; /* default generic selection */
   /* algorithm selection */
   int nprocs_pof2 = opal_next_poweroftwo(p) >> 1;
   if (libnbc_iallreduce_algorithm == 0) {
     if(p < 4 || size*count < 65536 || !ompi_op_is_commute(op) || inplace) {
       alg = NBC_ARED_BINOMIAL;
     } else if (count >= nprocs_pof2 && ompi_op_is_commute(op)) {
       alg = NBC_ARED_REDSCAT_ALLGATHER;
-    } else {
-      alg = NBC_ARED_RING;
     }
   } else {
     if (libnbc_iallreduce_algorithm == 1)
@@ -131,8 +130,6 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI
       alg = NBC_ARED_REDSCAT_ALLGATHER;
     else if (libnbc_iallreduce_algorithm == 4)
       alg = NBC_ARED_RDBL;
-    else
-      alg = NBC_ARED_RING;
   }
 #ifdef NBC_CACHE_SCHEDULE
   /* search schedule in communicator specific tree */
@@ -633,38 +630,37 @@ static inline int allred_sched_recursivedoubling(int rank, int p, const void *se
   return OMPI_SUCCESS;
 }
 
-static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, MPI_Op op,
-                                     int size, int ext, NBC_Schedule *schedule, void *tmpbuf) {
+static inline int
+allred_sched_ring(int r, int p,
+                  int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf,
+                  MPI_Op op, int size, int ext, NBC_Schedule *schedule, void *tmpbuf)
+{
   int segsize, *segsizes, *segoffsets; /* segment sizes and offsets per segment (number of segments == number of nodes) */
-  int speer, rpeer; /* send and recvpeer */
+  int speer, rpeer; /* send and recv peers */
   int res = OMPI_SUCCESS;
 
-  if (count == 0) {
+  if (0 == count) {
     return OMPI_SUCCESS;
   }
 
-  segsizes = (int *) malloc (sizeof (int) * p);
-  segoffsets = (int *) malloc (sizeof (int) * p);
-  if (NULL == segsizes || NULL == segoffsets) {
-    free (segsizes);
-    free (segoffsets);
+  segsizes = (int *) malloc ((2 * p + 1) * sizeof (int));
+  if (NULL == segsizes) {
     return OMPI_ERR_OUT_OF_RESOURCE;
   }
+  segoffsets = segsizes + p;
 
-  segsize = (count + p - 1) / p; /* size of the segments */
+  segsize = count / p; /* size of the segments across the last ranks.
+                          The remainder will be evenly distributed across the smaller ranks */
 
   segoffsets[0] = 0;
-  for (int i = 0, mycount = count ; i < p ; ++i) {
-    mycount -= segsize;
+  for (int i = 0, mycount = count % p ; i < p ; ++i) {
     segsizes[i] = segsize;
-    if (mycount < 0) {
-      segsizes[i] = segsize + mycount;
-      mycount = 0;
+    if (mycount > 0) { /* we have extra elements to distribute */
+      segsizes[i]++;
+      mycount--;
     }
 
-    if (i) {
-      segoffsets[i] = segoffsets[i-1] + segsizes[i-1];
-    }
+    segoffsets[i+1] = segoffsets[i] + segsizes[i];
   }
 
   /* reduce peers */
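
The new layout gives every rank a base segment of count / p elements and hands the count % p leftover elements, one each, to the first ranks; segoffsets becomes an exclusive prefix sum with one extra slot, which is why a single (2 * p + 1)-int allocation now covers both arrays. A standalone sketch of that computation (compute_segments is a hypothetical helper mirroring the loop above, not part of the patch):

```c
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical helper: same segment layout as allred_sched_ring.
 * segoffsets must have p + 1 slots so segoffsets[i+1] is always valid. */
static void compute_segments(int count, int p, int *segsizes, int *segoffsets) {
    int segsize = count / p;            /* base size per segment */
    segoffsets[0] = 0;
    for (int i = 0, mycount = count % p; i < p; ++i) {
        segsizes[i] = segsize;
        if (mycount > 0) {              /* hand one leftover element to this rank */
            segsizes[i]++;
            mycount--;
        }
        segoffsets[i+1] = segoffsets[i] + segsizes[i];
    }
}

int main(void) {
    int p = 4, count = 10;
    int *seg = malloc((2 * p + 1) * sizeof(int)); /* same single-allocation trick */
    int *off = seg + p;
    compute_segments(count, p, seg, off);
    for (int i = 0; i < p; ++i)
        printf("segment %d: size %d, offset %d\n", i, seg[i], off[i]);
    /* prints sizes 3,3,2,2 at offsets 0,3,6,8 */
    free(seg);
    return 0;
}
```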
@@ -786,28 +782,29 @@ static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datat
     }
 
     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
-      break;
+      goto free_and_return;
     }
-
-    res = NBC_Sched_recv ((char *) recvbuf + roffset, false, segsizes[relement], datatype, rpeer,
-                          schedule, true);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
-      break;
+    if (recvbuf != sendbuf) { /* check for MPI_IN_PLACE */
+      res = NBC_Sched_recv ((char *) recvbuf + roffset, false, segsizes[relement], datatype, rpeer,
+                            schedule, true);
+      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
+        goto free_and_return;
+      }
+      res = NBC_Sched_op ((char *) sendbuf + roffset, false, (char *) recvbuf + roffset, false,
+                          segsizes[relement], datatype, op, schedule, true);
+    } else {
+      res = NBC_Sched_recv ((char *) tmpbuf, false, segsizes[relement], datatype, rpeer,
+                            schedule, true);
+      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
+        goto free_and_return;
+      }
+      res = NBC_Sched_op ((char *) tmpbuf, false, (char *) recvbuf + roffset, false,
+                          segsizes[relement], datatype, op, schedule, true);
     }
-
-    res = NBC_Sched_op ((char *) sendbuf + roffset, false, (char *) recvbuf + roffset, false,
-                        segsizes[relement], datatype, op, schedule, true);
     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
-      break;
+      goto free_and_return;
     }
   }
-
-  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
-    free (segsizes);
-    free (segoffsets);
-    return res;
-  }
-
   for (int round = p - 1 ; round < 2 * p - 2 ; ++round) {
     int selement = (r+1-round + 2*p /*2*p avoids negative mod*/)%p; /* the element I am sending */
     int soffset = segoffsets[selement]*ext;
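
The reason for the branch: when sendbuf == recvbuf (MPI_IN_PLACE), recvbuf already holds the local contribution, so receiving the peer's segment straight into recvbuf would overwrite local data before the reduction runs. The incoming segment is therefore staged in tmpbuf, which only needs to hold one segment rather than the whole vector. An illustrative sketch of the two data paths (plain loops stand in for the NBC_Sched_recv/NBC_Sched_op schedule entries, and peer_seg is a hypothetical stand-in for the received data; this is not the libnbc API):

```c
#include <stdio.h>

/* Sketch only: contrast the out-of-place and in-place data paths of
 * one ring reduce step, using addition as the reduction op. */
static void ring_reduce_step(double *recvbuf, const double *sendbuf,
                             double *tmpbuf, const double *peer_seg,
                             int seg_len, int roffset) {
    if (recvbuf != sendbuf) {
        /* out-of-place: the peer segment may land in recvbuf and be
         * combined with the untouched sendbuf segment */
        for (int i = 0; i < seg_len; ++i)
            recvbuf[roffset + i] = peer_seg[i] + sendbuf[roffset + i];
    } else {
        /* MPI_IN_PLACE: recvbuf holds the local contribution, so stage
         * the peer segment in tmpbuf instead of clobbering recvbuf */
        for (int i = 0; i < seg_len; ++i)
            tmpbuf[i] = peer_seg[i];
        for (int i = 0; i < seg_len; ++i)
            recvbuf[roffset + i] += tmpbuf[i];
    }
}

int main(void) {
    double inplace[4] = { 1, 2, 3, 4 }, tmp[2];
    double peer[2] = { 10, 20 };
    /* in-place step on the two-element segment at offset 2 */
    ring_reduce_step(inplace, inplace, tmp, peer, 2, 2);
    printf("%g %g\n", inplace[2], inplace[3]); /* 13 24 */
    return 0;
}
```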
@@ -819,16 +816,14 @@ static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datat
     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
       break;
     }
-
     res = NBC_Sched_recv ((char *) recvbuf + roffset, false, segsizes[relement], datatype, rpeer,
                           schedule, true);
     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
       break;
     }
   }
-
+ free_and_return:
   free (segsizes);
-  free (segoffsets);
 
   return res;
 }
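
The scattered break statements plus the post-loop error check are replaced by a single free_and_return label, the usual C single-exit cleanup idiom: every failure path jumps to one place that releases the (now single) allocation. A generic sketch of the pattern (do_step is a hypothetical stand-in for a schedule call that may fail):

```c
#include <stdlib.h>

static int do_step(int i) { (void) i; return 0; } /* hypothetical fallible step */

static int scheduled_work(int n) {
    int res = 0;
    int *scratch = (int *) malloc ((2 * n + 1) * sizeof (int));
    if (NULL == scratch) {
        return -1; /* OMPI_ERR_OUT_OF_RESOURCE in the real code */
    }
    for (int i = 0; i < n; ++i) {
        res = do_step(i);
        if (0 != res) {
            goto free_and_return; /* one exit path, no duplicated free() */
        }
    }
free_and_return:
    free(scratch);
    return res;
}

int main(void) { return scheduled_work(8); }
```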
