Skip to content

Commit de71f5a

Browse files
author
Luke Robison
committed
coll/han: disable alltoall for device buffers and MPI_IN_PLACE
The han alltoall algorithm was never intended for an in-place alltoall and will crash if used in that configuration. Additionally, it is not an efficient choice for device buffers. Fall back to another algorithm if either condition is met.

Signed-off-by: Luke Robison <lrbison@amazon.com>
1 parent 08ac1be commit de71f5a

File tree

1 file changed

+45
-20
lines changed

1 file changed

+45
-20
lines changed

ompi/mca/coll/han/coll_han_alltoall.c

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ int mca_coll_han_alltoall_using_smsc(
6969
{
7070

7171
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
72+
opal_convertor_t convertor;
73+
int send_needs_bounce, have_device_buffer;
74+
size_t packed_size = 0;
75+
7276

7377
OPAL_OUTPUT_VERBOSE((90, mca_coll_han_component.han_output,
7478
"Entering mca_coll_han_alltoall_using_smsc\n"));
@@ -82,6 +86,44 @@ int mca_coll_han_alltoall_using_smsc(
8286
comm, han_module->previous_alltoall_module);
8387
}
8488

89+
if (sbuf == MPI_IN_PLACE) {
90+
/* This is not an in-place algorithm */
91+
return han_module->previous_alltoall(sbuf, scount, sdtype, rbuf, rcount, rdtype,
92+
comm, han_module->previous_alltoall_module);
93+
}
94+
95+
OBJ_CONSTRUCT( &convertor, opal_convertor_t );
96+
send_needs_bounce = 0;
97+
have_device_buffer = 0;
98+
/* get converter for copying to one of the leader ranks, and get packed size: */
99+
opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor, &sdtype->super, scount, sbuf, 0, &convertor);
100+
have_device_buffer |= opal_convertor_on_device(&convertor);
101+
send_needs_bounce |= opal_convertor_need_buffers(&convertor);
102+
opal_convertor_cleanup(&convertor);
103+
104+
opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor, &rdtype->super, rcount, rbuf, 0, &convertor);
105+
have_device_buffer |= opal_convertor_on_device(&convertor);
106+
send_needs_bounce |= opal_convertor_need_buffers(&convertor);
107+
opal_convertor_get_packed_size( &convertor, &packed_size );
108+
opal_convertor_cleanup(&convertor);
109+
110+
if (have_device_buffer) {
111+
/*
112+
Although this algorithm is functional for device buffers, it requires an
113+
extra copy through the bounce buffer, which makes it inefficient.
114+
Prefer another algorithm instead.
115+
116+
Note that Open MPI makes assumptions that if one rank uses a device
117+
buffer in a collective, then all ranks will use device buffers, so there
118+
is no need to communicate before taking this branch.
119+
*/
120+
OBJ_DESTRUCT(&convertor);
121+
return han_module->previous_alltoall(sbuf, scount, sdtype, rbuf, rcount, rdtype,
122+
comm, han_module->previous_alltoall_module);
123+
}
124+
125+
126+
85127
/* Create the subcommunicators */
86128
if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) {
87129
opal_output_verbose(1, mca_coll_han_component.han_output,
@@ -107,12 +149,11 @@ int mca_coll_han_alltoall_using_smsc(
107149
comm, han_module->previous_alltoall_module);
108150
}
109151

110-
int rc, send_needs_bounce, ii_push_data;
152+
int rc, ii_push_data;
111153
size_t sndsize;
112154
MPI_Aint sextent, rextent, lb;
113-
char *send_bounce;
114-
opal_convertor_t convertor;
115-
size_t packed_size = 0, packed_size_tmp;
155+
char *send_bounce = NULL;
156+
size_t packed_size_tmp;
116157
int use_isend;
117158
void *gather_buf_in[4];
118159
int up_rank;
@@ -140,22 +181,6 @@ int mca_coll_han_alltoall_using_smsc(
140181
}
141182
if (fanout > up_size) { fanout = up_size; }
142183

143-
OBJ_CONSTRUCT( &convertor, opal_convertor_t );
144-
145-
146-
send_needs_bounce = 0;
147-
/* get converter for copying to one of the leader ranks, and get packed size: */
148-
opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor, &sdtype->super, scount, sbuf, 0, &convertor);
149-
send_needs_bounce |= 0 != opal_convertor_on_device(&convertor);
150-
send_needs_bounce |= opal_convertor_need_buffers(&convertor);
151-
opal_convertor_cleanup(&convertor);
152-
153-
opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor, &rdtype->super, rcount, rbuf, 0, &convertor);
154-
send_needs_bounce |= 0 != opal_convertor_on_device(&convertor);
155-
send_needs_bounce |= opal_convertor_need_buffers(&convertor);
156-
opal_convertor_get_packed_size( &convertor, &packed_size );
157-
opal_convertor_cleanup(&convertor);
158-
159184
/*
160185
Because push-mode needs extra synchronizations, we'd like to avoid it,
161186
however it might be necessary:

0 commit comments

Comments
 (0)