Skip to content

Commit 8702ff5

Browse files
authored
Merge pull request #9393 from bosilca/fix/9385
Fix the libnbc MPILAllreduce ring algorithm.
2 parents 855b523 + ff03f43 commit 8702ff5

File tree

1 file changed

+37
-42
lines changed

1 file changed

+37
-42
lines changed

ompi/mca/coll/libnbc/nbc_iallreduce.c

Lines changed: 37 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -112,15 +112,14 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI
112112
return OMPI_ERR_OUT_OF_RESOURCE;
113113
}
114114

115+
alg = NBC_ARED_RING; /* default generic selection */
115116
/* algorithm selection */
116117
int nprocs_pof2 = opal_next_poweroftwo(p) >> 1;
117118
if (libnbc_iallreduce_algorithm == 0) {
118119
if(p < 4 || size*count < 65536 || !ompi_op_is_commute(op) || inplace) {
119120
alg = NBC_ARED_BINOMIAL;
120121
} else if (count >= nprocs_pof2 && ompi_op_is_commute(op)) {
121122
alg = NBC_ARED_REDSCAT_ALLGATHER;
122-
} else {
123-
alg = NBC_ARED_RING;
124123
}
125124
} else {
126125
if (libnbc_iallreduce_algorithm == 1)
@@ -131,8 +130,6 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI
131130
alg = NBC_ARED_REDSCAT_ALLGATHER;
132131
else if (libnbc_iallreduce_algorithm == 4)
133132
alg = NBC_ARED_RDBL;
134-
else
135-
alg = NBC_ARED_RING;
136133
}
137134
#ifdef NBC_CACHE_SCHEDULE
138135
/* search schedule in communicator specific tree */
@@ -633,38 +630,37 @@ static inline int allred_sched_recursivedoubling(int rank, int p, const void *se
633630
return OMPI_SUCCESS;
634631
}
635632

636-
static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, MPI_Op op,
637-
int size, int ext, NBC_Schedule *schedule, void *tmpbuf) {
633+
static inline int
634+
allred_sched_ring(int r, int p,
635+
int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf,
636+
MPI_Op op, int size, int ext, NBC_Schedule *schedule, void *tmpbuf)
637+
{
638638
int segsize, *segsizes, *segoffsets; /* segment sizes and offsets per segment (number of segments == number of nodes */
639-
int speer, rpeer; /* send and recvpeer */
639+
int speer, rpeer; /* send and recv peers */
640640
int res = OMPI_SUCCESS;
641641

642-
if (count == 0) {
642+
if (0 == count) {
643643
return OMPI_SUCCESS;
644644
}
645645

646-
segsizes = (int *) malloc (sizeof (int) * p);
647-
segoffsets = (int *) malloc (sizeof (int) * p);
648-
if (NULL == segsizes || NULL == segoffsets) {
649-
free (segsizes);
650-
free (segoffsets);
646+
segsizes = (int *) malloc((2 * p + 1 ) *sizeof (int));
647+
if (NULL == segsizes) {
651648
return OMPI_ERR_OUT_OF_RESOURCE;
652649
}
650+
segoffsets = segsizes + p;
653651

654-
segsize = (count + p - 1) / p; /* size of the segments */
652+
segsize = count / p; /* size of the segments across the last ranks.
653+
The remainder will be evenly distributed across the smaller ranks */
655654

656655
segoffsets[0] = 0;
657-
for (int i = 0, mycount = count ; i < p ; ++i) {
658-
mycount -= segsize;
656+
for (int i = 0, mycount = count % p; i < p ; ++i) {
659657
segsizes[i] = segsize;
660-
if (mycount < 0) {
661-
segsizes[i] = segsize + mycount;
662-
mycount = 0;
658+
if( mycount > 0 ) { /* We have extra segments to distribute */
659+
segsizes[i]++;
660+
mycount--;
663661
}
664662

665-
if (i) {
666-
segoffsets[i] = segoffsets[i-1] + segsizes[i-1];
667-
}
663+
segoffsets[i+1] = segoffsets[i] + segsizes[i];
668664
}
669665

670666
/* reduce peers */
@@ -786,28 +782,29 @@ static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datat
786782
}
787783

788784
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
789-
break;
785+
goto free_and_return;
790786
}
791-
792-
res = NBC_Sched_recv ((char *) recvbuf + roffset, false, segsizes[relement], datatype, rpeer,
793-
schedule, true);
794-
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
795-
break;
787+
if( recvbuf != sendbuf ) { /* check for MPI_IN_PLACE */
788+
res = NBC_Sched_recv ((char *) recvbuf + roffset, false, segsizes[relement], datatype, rpeer,
789+
schedule, true);
790+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
791+
goto free_and_return;
792+
}
793+
res = NBC_Sched_op ((char *) sendbuf + roffset, false, (char *) recvbuf + roffset, false,
794+
segsizes[relement], datatype, op, schedule, true);
795+
} else {
796+
res = NBC_Sched_recv ((char *) tmpbuf, false, segsizes[relement], datatype, rpeer,
797+
schedule, true);
798+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
799+
goto free_and_return;
800+
}
801+
res = NBC_Sched_op ((char *) tmpbuf, false, (char *) recvbuf + roffset, false,
802+
segsizes[relement], datatype, op, schedule, true);
796803
}
797-
798-
res = NBC_Sched_op ((char *) sendbuf + roffset, false, (char *) recvbuf + roffset, false,
799-
segsizes[relement], datatype, op, schedule, true);
800804
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
801-
break;
805+
goto free_and_return;
802806
}
803807
}
804-
805-
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
806-
free (segsizes);
807-
free (segoffsets);
808-
return res;
809-
}
810-
811808
for (int round = p - 1 ; round < 2 * p - 2 ; ++round) {
812809
int selement = (r+1-round + 2*p /*2*p avoids negative mod*/)%p; /* the element I am sending */
813810
int soffset = segoffsets[selement]*ext;
@@ -819,16 +816,14 @@ static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datat
819816
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
820817
break;
821818
}
822-
823819
res = NBC_Sched_recv ((char *) recvbuf + roffset, false, segsizes[relement], datatype, rpeer,
824820
schedule, true);
825821
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
826822
break;
827823
}
828824
}
829-
825+
free_and_return:
830826
free (segsizes);
831-
free (segoffsets);
832827

833828
return res;
834829
}

0 commit comments

Comments
 (0)