Skip to content

Commit 7d1711f

Browse files
authored
Merge pull request #9616 from jjhursey/libnbc-fix-overflow
libnbc: Fix int overflow when handling count parameters
2 parents 18d80f5 + 6075048 commit 7d1711f

File tree

5 files changed

+67
-28
lines changed

5 files changed

+67
-28
lines changed

ompi/mca/coll/libnbc/nbc.c

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
* Author(s): Torsten Hoefler <htor@cs.indiana.edu>
1717
*
1818
* Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved.
19-
* Copyright (c) 2016 IBM Corporation. All rights reserved.
19+
* Copyright (c) 2016-2021 IBM Corporation. All rights reserved.
2020
* Copyright (c) 2017 Ian Bradley Morgan and Anthony Skjellum. All
2121
* rights reserved.
2222
* Copyright (c) 2018 FUJITSU LIMITED. All rights reserved.
@@ -119,7 +119,7 @@ static int nbc_schedule_round_append (NBC_Schedule *schedule, void *data, int da
119119
}
120120

121121
/* this function puts a send into the schedule */
122-
static int NBC_Sched_send_internal (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, bool local, NBC_Schedule *schedule, bool barrier) {
122+
static int NBC_Sched_send_internal (const void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int dest, bool local, NBC_Schedule *schedule, bool barrier) {
123123
NBC_Args_send send_args;
124124
int ret;
125125

@@ -143,16 +143,16 @@ static int NBC_Sched_send_internal (const void* buf, char tmpbuf, int count, MPI
143143
return OMPI_SUCCESS;
144144
}
145145

146-
int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) {
146+
int NBC_Sched_send (const void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) {
147147
return NBC_Sched_send_internal (buf, tmpbuf, count, datatype, dest, false, schedule, barrier);
148148
}
149149

150-
int NBC_Sched_local_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) {
150+
int NBC_Sched_local_send (const void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) {
151151
return NBC_Sched_send_internal (buf, tmpbuf, count, datatype, dest, true, schedule, barrier);
152152
}
153153

154154
/* this function puts a receive into the schedule */
155-
static int NBC_Sched_recv_internal (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, bool local, NBC_Schedule *schedule, bool barrier) {
155+
static int NBC_Sched_recv_internal (void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int source, bool local, NBC_Schedule *schedule, bool barrier) {
156156
NBC_Args_recv recv_args;
157157
int ret;
158158

@@ -176,16 +176,16 @@ static int NBC_Sched_recv_internal (void* buf, char tmpbuf, int count, MPI_Datat
176176
return OMPI_SUCCESS;
177177
}
178178

179-
int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) {
179+
int NBC_Sched_recv (void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) {
180180
return NBC_Sched_recv_internal(buf, tmpbuf, count, datatype, source, false, schedule, barrier);
181181
}
182182

183-
int NBC_Sched_local_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) {
183+
int NBC_Sched_local_recv (void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) {
184184
return NBC_Sched_recv_internal(buf, tmpbuf, count, datatype, source, true, schedule, barrier);
185185
}
186186

187187
/* this function puts an operation into the schedule */
188-
int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype,
188+
int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, size_t count, MPI_Datatype datatype,
189189
MPI_Op op, NBC_Schedule *schedule, bool barrier) {
190190
NBC_Args_op op_args;
191191
int ret;
@@ -212,7 +212,8 @@ int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int
212212
}
213213

214214
/* this function puts a copy into the schedule */
215-
int NBC_Sched_copy (void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount,
215+
int NBC_Sched_copy (void *src, char tmpsrc, size_t srccount, MPI_Datatype srctype,
216+
void *tgt, char tmptgt, size_t tgtcount,
216217
MPI_Datatype tgttype, NBC_Schedule *schedule, bool barrier) {
217218
NBC_Args_copy copy_args;
218219
int ret;
@@ -240,7 +241,7 @@ int NBC_Sched_copy (void *src, char tmpsrc, int srccount, MPI_Datatype srctype,
240241
}
241242

242243
/* this function puts a unpack into the schedule */
243-
int NBC_Sched_unpack (void *inbuf, char tmpinbuf, int count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf,
244+
int NBC_Sched_unpack (void *inbuf, char tmpinbuf, size_t count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf,
244245
NBC_Schedule *schedule, bool barrier) {
245246
NBC_Args_unpack unpack_args;
246247
int ret;
@@ -534,6 +535,7 @@ static inline int NBC_Start_round(NBC_Handle *handle) {
534535
} else {
535536
buf2=opargs.buf2;
536537
}
538+
537539
ompi_op_reduce(opargs.op, buf1, buf2, opargs.count, opargs.datatype);
538540
break;
539541
case COPY:

ompi/mca/coll/libnbc/nbc_internal.h

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1616
* reserved.
1717
* Copyright (c) 2018 FUJITSU LIMITED. All rights reserved.
18+
* Copyright (c) 2021 IBM Corporation. All rights reserved.
1819
* $COPYRIGHT$
1920
*
2021
* Additional copyrights may follow
@@ -90,7 +91,7 @@ typedef enum {
9091
/* the send argument struct */
9192
typedef struct {
9293
NBC_Fn_type type;
93-
int count;
94+
size_t count;
9495
const void *buf;
9596
MPI_Datatype datatype;
9697
int dest;
@@ -101,7 +102,7 @@ typedef struct {
101102
/* the receive argument struct */
102103
typedef struct {
103104
NBC_Fn_type type;
104-
int count;
105+
size_t count;
105106
void *buf;
106107
MPI_Datatype datatype;
107108
char tmpbuf;
@@ -118,26 +119,26 @@ typedef struct {
118119
void *buf2;
119120
MPI_Op op;
120121
MPI_Datatype datatype;
121-
int count;
122+
size_t count;
122123
} NBC_Args_op;
123124

124125
/* the copy argument struct */
125126
typedef struct {
126127
NBC_Fn_type type;
127-
int srccount;
128+
size_t srccount;
128129
void *src;
129130
void *tgt;
130131
MPI_Datatype srctype;
131132
MPI_Datatype tgttype;
132-
int tgtcount;
133+
size_t tgtcount;
133134
char tmpsrc;
134135
char tmptgt;
135136
} NBC_Args_copy;
136137

137138
/* unpack operation arguments */
138139
typedef struct {
139140
NBC_Fn_type type;
140-
int count;
141+
size_t count;
141142
void *inbuf;
142143
void *outbuf;
143144
MPI_Datatype datatype;
@@ -146,15 +147,15 @@ typedef struct {
146147
} NBC_Args_unpack;
147148

148149
/* internal function prototypes */
149-
int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier);
150-
int NBC_Sched_local_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest,NBC_Schedule *schedule, bool barrier);
151-
int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier);
152-
int NBC_Sched_local_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier);
153-
int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype,
150+
int NBC_Sched_send (const void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier);
151+
int NBC_Sched_local_send (const void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int dest,NBC_Schedule *schedule, bool barrier);
152+
int NBC_Sched_recv (void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier);
153+
int NBC_Sched_local_recv (void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier);
154+
int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, size_t count, MPI_Datatype datatype,
154155
MPI_Op op, NBC_Schedule *schedule, bool barrier);
155-
int NBC_Sched_copy (void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount,
156+
int NBC_Sched_copy (void *src, char tmpsrc, size_t srccount, MPI_Datatype srctype, void *tgt, char tmptgt, size_t tgtcount,
156157
MPI_Datatype tgttype, NBC_Schedule *schedule, bool barrier);
157-
int NBC_Sched_unpack (void *inbuf, char tmpinbuf, int count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf,
158+
int NBC_Sched_unpack (void *inbuf, char tmpinbuf, size_t count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf,
158159
NBC_Schedule *schedule, bool barrier);
159160

160161
int NBC_Sched_barrier (NBC_Schedule *schedule);

ompi/mca/coll/libnbc/nbc_ireduce_scatter.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@
4545
static int nbc_reduce_scatter_init(const void* sendbuf, void* recvbuf, const int *recvcounts, MPI_Datatype datatype,
4646
MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
4747
mca_coll_base_module_t *module, bool persistent) {
48-
int peer, rank, maxr, p, res, count;
48+
int peer, rank, maxr, p, res;
49+
size_t count;
4950
MPI_Aint ext;
5051
ptrdiff_t gap, span, span_align;
5152
char *sbuf, inplace;
@@ -230,7 +231,8 @@ int ompi_coll_libnbc_ireduce_scatter (const void* sendbuf, void* recvbuf, const
230231
static int nbc_reduce_scatter_inter_init (const void* sendbuf, void* recvbuf, const int *recvcounts, MPI_Datatype datatype,
231232
MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
232233
mca_coll_base_module_t *module, bool persistent) {
233-
int rank, res, count, lsize, rsize;
234+
int rank, res, lsize, rsize;
235+
size_t count;
234236
MPI_Aint ext;
235237
ptrdiff_t gap, span, span_align;
236238
NBC_Schedule *schedule;

ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@
4343
static int nbc_reduce_scatter_block_init(const void* sendbuf, void* recvbuf, int recvcount, MPI_Datatype datatype,
4444
MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
4545
mca_coll_base_module_t *module, bool persistent) {
46-
int peer, rank, maxr, p, res, count;
46+
int peer, rank, maxr, p, res;
47+
size_t count;
4748
MPI_Aint ext;
4849
ptrdiff_t gap, span;
4950
char *redbuf, *sbuf, inplace;
@@ -229,7 +230,8 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i
229230
static int nbc_reduce_scatter_block_inter_init(const void *sendbuf, void *recvbuf, int rcount, struct ompi_datatype_t *dtype,
230231
struct ompi_op_t *op, struct ompi_communicator_t *comm, ompi_request_t **request,
231232
mca_coll_base_module_t *module, bool persistent) {
232-
int rank, res, count, lsize, rsize;
233+
int rank, res, lsize, rsize;
234+
size_t count;
233235
MPI_Aint ext;
234236
ptrdiff_t gap, span, span_align;
235237
NBC_Schedule *schedule;

ompi/op/op.h

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
* and Technology (RIST). All rights reserved.
2121
* Copyright (c) 2018 Triad National Security, LLC. All rights
2222
* reserved.
23+
* Copyright (c) 2021 IBM Corporation. All rights reserved.
2324
* $COPYRIGHT$
2425
*
2526
* Additional copyrights may follow
@@ -510,10 +511,41 @@ static inline bool ompi_op_is_valid(ompi_op_t * op, ompi_datatype_t * ddt,
510511
* is not defined to have that operation, it is likely to seg fault.
511512
*/
512513
static inline void ompi_op_reduce(ompi_op_t * op, void *source,
513-
void *target, int count,
514+
void *target, size_t full_count,
514515
ompi_datatype_t * dtype)
515516
{
516517
MPI_Fint f_dtype, f_count;
518+
int count = full_count;
519+
520+
/*
521+
* If the full_count is > INT_MAX then we need to call the reduction op
522+
* in iterations of counts <= INT_MAX since it has an `int *len`
523+
* parameter.
524+
*
525+
* Note: When we add BigCount support then we can distinguish between
526+
* a reduction operation with `int *len` and `MPI_Count *len`. At which
527+
* point we can avoid this loop.
528+
*/
529+
if( OPAL_UNLIKELY(full_count > INT_MAX) ) {
530+
size_t done_count = 0, shift;
531+
int iter_count;
532+
ptrdiff_t ext, lb;
533+
534+
ompi_datatype_get_extent(dtype, &lb, &ext);
535+
536+
while(done_count < full_count) {
537+
if(done_count + INT_MAX > full_count) {
538+
iter_count = full_count - done_count;
539+
} else {
540+
iter_count = INT_MAX;
541+
}
542+
shift = done_count * ext;
543+
// Recurse one level in iterations of 'int'
544+
ompi_op_reduce(op, (char*)source + shift, (char*)target + shift, iter_count, dtype);
545+
done_count += iter_count;
546+
}
547+
return;
548+
}
517549

518550
/*
519551
* Call the reduction function. Two dimensions: a) if both the op

0 commit comments

Comments
 (0)