Skip to content

Commit 6c0305e

Browse files
committed
libnbc: int overflow "offset = i * count * extent"
This bug was hit inside an MPI_Iallgather() test using 4 ranks each contributing 800,000,000 MPI_LONG_LONG where the overflow is near the top of nbc_allgather_init(). The code was ```rbuf = (char *) recvbuf + rank * recvcount * rcvext;``` where rank and recvcount are ints, and rcvext is an MPI_Aint. But the left part of the arithmetic happens first and overflows before it gets to the Aint promotion. So I changed this to ```ruf = (char *) recvbuf + (MPI_Aint) rcvext * rank * recvcount;``` The philosopy for styling the change this particular way is 1. readability: putting an explicit cast at the start of a multiplication makes it obvious at a glance that adequate int-sizing is in place to avoid overflow 2. performance: reordering it such that the first typecast is extraneous (the extents are already MPI_Aint) avoids any potential performance penalty vs for example if we put the typecast on "rank". It would also work to just use "rcvext*rank*recvcount" since rcvext is already MPI_Aint, but I like the explicit typecast as it guarantees no overflow without having to double check each spot to make sure the extent variable really is an MPI_Aint as one would expect it to be. I'm not 100% convinced that performance penalty is real, but it was a concern raised by bosilca and I can't say for sure it's not real. I made the same change a handful of places, although I only have a testcase specifically hitting the overflow that was at the top of nbc_allgather_init(). I also took the liberty of changing "int ext" in red_sched_chain() to "MPI_Aint ext". I can't see any reason for it to just be int. Signed-off-by: Mark Allen <markalle@us.ibm.com>
1 parent fa08eac commit 6c0305e

File tree

9 files changed

+25
-25
lines changed

9 files changed

+25
-25
lines changed

ompi/mca/coll/libnbc/nbc_iallgather.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ static int nbc_allgather_init(const void* sendbuf, int sendcount, MPI_Datatype s
9292
sendcount = recvcount;
9393
} else if (!persistent) { /* for persistent, the copy must be scheduled */
9494
/* copy my data to receive buffer */
95-
rbuf = (char *) recvbuf + rank * recvcount * rcvext;
95+
rbuf = (char *) recvbuf + (MPI_Aint)rcvext * rank * recvcount;
9696
res = NBC_Copy (sendbuf, sendcount, sendtype, rbuf, recvcount, recvtype, comm);
9797
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
9898
return res;
@@ -121,7 +121,7 @@ static int nbc_allgather_init(const void* sendbuf, int sendcount, MPI_Datatype s
121121
if (persistent && !inplace) {
122122
/* for nonblocking, data has been copied already */
123123
/* copy my data to receive buffer (= send buffer of NBC_Sched_send) */
124-
rbuf = (char *)recvbuf + rank * recvcount * rcvext;
124+
rbuf = (char *)recvbuf + (MPI_Aint) rcvext * rank * recvcount;
125125
res = NBC_Sched_copy((void *)sendbuf, false, sendcount, sendtype,
126126
rbuf, false, recvcount, recvtype, schedule, true);
127127
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
@@ -237,7 +237,7 @@ static int nbc_allgather_inter_init(const void* sendbuf, int sendcount, MPI_Data
237237
/* do rsize - 1 rounds */
238238
for (int r = 0 ; r < rsize ; ++r) {
239239
/* recv from rank r */
240-
rbuf = (char *) recvbuf + r * recvcount * rcvext;
240+
rbuf = (char *) recvbuf + (MPI_Aint) rcvext * r * recvcount;
241241
res = NBC_Sched_recv (rbuf, false, recvcount, recvtype, r, schedule, false);
242242
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
243243
OBJ_RELEASE(schedule);
@@ -303,12 +303,12 @@ static inline int allgather_sched_linear(
303303
ptrdiff_t rlb, rext;
304304

305305
res = ompi_datatype_get_extent(rdtype, &rlb, &rext);
306-
char *sbuf = (char *)recvbuf + rank * rcount * rext;
306+
char *sbuf = (char *)recvbuf + (MPI_Aint) rext * rank * rcount;
307307

308308
for (int remote = 0; remote < comm_size ; ++remote) {
309309
if (remote != rank) {
310310
/* Recv from rank remote */
311-
char *rbuf = (char *)recvbuf + remote * rcount * rext;
311+
char *rbuf = (char *)recvbuf + (MPI_Aint) rext * remote * rcount;
312312
res = NBC_Sched_recv(rbuf, false, rcount, rdtype, remote, schedule, false);
313313
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
314314

ompi/mca/coll/libnbc/nbc_ialltoall.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -331,14 +331,14 @@ static int nbc_alltoall_inter_init (const void* sendbuf, int sendcount, MPI_Data
331331

332332
for (int i = 0; i < rsize; i++) {
333333
/* post all sends */
334-
sbuf = (char *) sendbuf + i * sendcount * sndext;
334+
sbuf = (char *) sendbuf + (MPI_Aint) sndext * i * sendcount;
335335
res = NBC_Sched_send (sbuf, false, sendcount, sendtype, i, schedule, false);
336336
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
337337
break;
338338
}
339339

340340
/* post all receives */
341-
rbuf = (char *) recvbuf + i * recvcount * rcvext;
341+
rbuf = (char *) recvbuf + (MPI_Aint) rcvext * i * recvcount;
342342
res = NBC_Sched_recv (rbuf, false, recvcount, recvtype, i, schedule, false);
343343
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
344344
break;
@@ -397,13 +397,13 @@ static inline int a2a_sched_pairwise(int rank, int p, MPI_Aint sndext, MPI_Aint
397397
int sndpeer = (rank + r) % p;
398398
int rcvpeer = (rank - r + p) % p;
399399

400-
char *rbuf = (char *) recvbuf + rcvpeer * recvcount * rcvext;
400+
char *rbuf = (char *) recvbuf + (MPI_Aint) rcvext * rcvpeer * recvcount;
401401
res = NBC_Sched_recv (rbuf, false, recvcount, recvtype, rcvpeer, schedule, false);
402402
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
403403
return res;
404404
}
405405

406-
char *sbuf = (char *) sendbuf + sndpeer * sendcount * sndext;
406+
char *sbuf = (char *) sendbuf + (MPI_Aint) sndext * sndpeer * sendcount;
407407
res = NBC_Sched_send (sbuf, false, sendcount, sendtype, sndpeer, schedule, true);
408408
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
409409
return res;
@@ -523,7 +523,7 @@ static inline int a2a_sched_diss(int rank, int p, MPI_Aint sndext, MPI_Aint rcve
523523

524524
/* phase 3 - reorder - data is now in wrong order in tmpbuf - reorder it into recvbuf */
525525
for (int i = 0 ; i < p; ++i) {
526-
rbuf = (char *) recvbuf + ((rank - i + p) % p) * recvcount * rcvext;
526+
rbuf = (char *) recvbuf + (MPI_Aint) rcvext * ((rank - i + p) % p) * recvcount;
527527
res = NBC_Sched_unpack ((void *)(intptr_t) (i * datasize), true, recvcount, recvtype, rbuf, false, schedule,
528528
false);
529529
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {

ompi/mca/coll/libnbc/nbc_ibcast.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -327,11 +327,11 @@ static inline int bcast_sched_chain(int rank, int p, int root, NBC_Schedule *sch
327327
fragcount = count/numfrag;
328328

329329
for (int fragnum = 0 ; fragnum < numfrag ; ++fragnum) {
330-
buf = (char *) buffer + fragnum * fragcount * ext;
330+
buf = (char *) buffer + (MPI_Aint)ext * fragnum * fragcount;
331331
thiscount = fragcount;
332332
if (fragnum == numfrag-1) {
333333
/* last fragment may not be full */
334-
thiscount = count - fragcount * fragnum;
334+
thiscount = count - (size_t)fragcount * fragnum;
335335
}
336336

337337
/* root does not receive */

ompi/mca/coll/libnbc/nbc_igather.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ static int nbc_gather_init(const void* sendbuf, int sendcount, MPI_Datatype send
103103
}
104104
} else {
105105
for (int i = 0 ; i < p ; ++i) {
106-
rbuf = (char *)recvbuf + i * recvcount * rcvext;
106+
rbuf = (char *)recvbuf + (MPI_Aint) rcvext * i * recvcount;
107107
if (i == root) {
108108
if (!inplace) {
109109
/* if I am the root - just copy the message */
@@ -228,7 +228,7 @@ static int nbc_gather_inter_init (const void* sendbuf, int sendcount, MPI_Dataty
228228
}
229229
} else if (MPI_ROOT == root) {
230230
for (int i = 0 ; i < rsize ; ++i) {
231-
rbuf = ((char *)recvbuf) + (i * recvcount * rcvext);
231+
rbuf = ((char *)recvbuf) + ((MPI_Aint) rcvext * i * recvcount);
232232
/* root receives message to the right buffer */
233233
res = NBC_Sched_recv (rbuf, false, recvcount, recvtype, i, schedule, false);
234234
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {

ompi/mca/coll/libnbc/nbc_ineighbor_allgather.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ static int nbc_neighbor_allgather_init(const void *sbuf, int scount, MPI_Datatyp
8686

8787
for (int i = 0 ; i < indegree ; ++i) {
8888
if (MPI_PROC_NULL != srcs[i]) {
89-
res = NBC_Sched_recv ((char *) rbuf + i * rcount * rcvext, true, rcount, rtype, srcs[i], schedule, false);
89+
res = NBC_Sched_recv ((char *) rbuf + (MPI_Aint) rcvext * i * rcount, true, rcount, rtype, srcs[i], schedule, false);
9090
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
9191
break;
9292
}

ompi/mca/coll/libnbc/nbc_ineighbor_alltoall.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ static int nbc_neighbor_alltoall_init(const void *sbuf, int scount, MPI_Datatype
8989

9090
for (int i = 0 ; i < indegree ; ++i) {
9191
if (MPI_PROC_NULL != srcs[i]) {
92-
res = NBC_Sched_recv ((char *) rbuf + i * rcount * rcvext, true, rcount, rtype, srcs[i], schedule, false);
92+
res = NBC_Sched_recv ((char *) rbuf + (MPI_Aint) rcvext * i * rcount, true, rcount, rtype, srcs[i], schedule, false);
9393
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
9494
break;
9595
}
@@ -106,7 +106,7 @@ static int nbc_neighbor_alltoall_init(const void *sbuf, int scount, MPI_Datatype
106106

107107
for (int i = 0 ; i < outdegree ; ++i) {
108108
if (MPI_PROC_NULL != dsts[i]) {
109-
res = NBC_Sched_send ((char *) sbuf + i * scount * sndext, false, scount, stype, dsts[i], schedule, false);
109+
res = NBC_Sched_send ((char *) sbuf + (MPI_Aint) sndext * i * scount, false, scount, stype, dsts[i], schedule, false);
110110
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
111111
break;
112112
}

ompi/mca/coll/libnbc/nbc_ireduce.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, char tmpredbuf, int count, MPI_Datatype datatype,
3030
MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf);
3131
static inline int red_sched_chain (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
32-
MPI_Op op, int ext, size_t size, NBC_Schedule *schedule, void *tmpbuf, int fragsize);
32+
MPI_Op op, MPI_Aint ext, size_t size, NBC_Schedule *schedule, void *tmpbuf, int fragsize);
3333

3434
static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, void *tmpbuf, int count, MPI_Datatype datatype,
3535
MPI_Op op, NBC_Schedule *schedule);
@@ -459,7 +459,7 @@ static inline int red_sched_binomial (int rank, int p, int root, const void *sen
459459

460460
/* chain send ... */
461461
static inline int red_sched_chain (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
462-
MPI_Op op, int ext, size_t size, NBC_Schedule *schedule, void *tmpbuf, int fragsize) {
462+
MPI_Op op, MPI_Aint ext, size_t size, NBC_Schedule *schedule, void *tmpbuf, int fragsize) {
463463
int res, vrank, rpeer, speer, numfrag, fragcount, thiscount;
464464
long offset;
465465

@@ -479,11 +479,11 @@ static inline int red_sched_chain (int rank, int p, int root, const void *sendbu
479479
fragcount = count / numfrag;
480480

481481
for (int fragnum = 0 ; fragnum < numfrag ; ++fragnum) {
482-
offset = fragnum * fragcount * ext;
482+
offset = (MPI_Aint) ext * fragnum * fragcount;
483483
thiscount = fragcount;
484484
if(fragnum == numfrag - 1) {
485485
/* last fragment may not be full */
486-
thiscount = count - fragcount * fragnum;
486+
thiscount = count - (size_t)fragcount * fragnum;
487487
}
488488

489489
/* last node does not recv */

ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ static int nbc_reduce_scatter_block_init(const void* sendbuf, void* recvbuf, int
7070

7171
maxr = ceil_of_log2(p);
7272

73-
count = p * recvcount;
73+
count = (size_t) p * recvcount;
7474

7575
if (0 < count) {
7676
char *rbuf, *lbuf, *buf;
@@ -248,7 +248,7 @@ static int nbc_reduce_scatter_block_inter_init(const void *sendbuf, void *recvbu
248248
return res;
249249
}
250250

251-
count = rcount * lsize;
251+
count = (size_t)rcount * lsize;
252252

253253
span = opal_datatype_span(&dtype->super, count, &gap);
254254
span_align = OPAL_ALIGN(span, dtype->super.align, ptrdiff_t);

ompi/mca/coll/libnbc/nbc_iscatter.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ static int nbc_scatter_init (const void* sendbuf, int sendcount, MPI_Datatype se
9999
}
100100
} else {
101101
for (int i = 0 ; i < p ; ++i) {
102-
sbuf = (char *) sendbuf + i * sendcount * sndext;
102+
sbuf = (char *) sendbuf + (MPI_Aint) sndext * i * sendcount;
103103
if (i == root) {
104104
if (!inplace) {
105105
/* if I am the root - just copy the message */
@@ -222,7 +222,7 @@ static int nbc_scatter_inter_init (const void* sendbuf, int sendcount, MPI_Datat
222222
}
223223
} else if (MPI_ROOT == root) {
224224
for (int i = 0 ; i < rsize ; ++i) {
225-
sbuf = ((char *)sendbuf) + (i * sendcount * sndext);
225+
sbuf = ((char *)sendbuf) + ((MPI_Aint) sndext * i * sendcount);
226226
/* root sends the right buffer to the right receiver */
227227
res = NBC_Sched_send(sbuf, false, sendcount, sendtype, i, schedule, false);
228228
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {

0 commit comments

Comments
 (0)