Skip to content

Commit fdaf38c

Browse files
committed
ompi/communicator: do not use MPI_IN_PLACE for iallreduce callback
This bug was revealed by the MTT communicator tests (#12245). The root cause is that the iallreduce callback is called on the parent communicator, which can be an inter-communicator. MPI_IN_PLACE is not a valid send buffer for an allreduce on an inter-communicator, so the fix is to use a dedicated sendbuf for the callback. Signed-off-by: Wenduo Wang <wenduwan@amazon.com>
1 parent c3b6852 commit fdaf38c

File tree

1 file changed

+10
-4
lines changed

1 file changed

+10
-4
lines changed

ompi/communicator/comm_cid.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ struct ompi_comm_cid_context_t {
9898
int remote_leader;
9999
int iter;
100100
/** storage for activate barrier */
101+
int local_peers;
101102
int max_local_peers;
102103
char *port_string;
103104
bool send_first;
@@ -266,7 +267,8 @@ static ompi_comm_cid_context_t *mca_comm_cid_context_alloc (ompi_communicator_t
266267

267268
context->send_first = send_first;
268269
context->iter = 0;
269-
context->max_local_peers = ompi_group_count_local_peers(newcomm->c_local_group);
270+
context->local_peers = ompi_group_count_local_peers(newcomm->c_local_group);
271+
context->max_local_peers = -1;
270272

271273
return context;
272274
}
@@ -908,8 +910,12 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
908910
}
909911

910912
if (OMPI_COMM_IS_INTRA(*newcomm)) {
911-
/* The communicator's disjointness is inferred from max_local_peers. */
912-
ret = context->iallreduce_fn (MPI_IN_PLACE, &context->max_local_peers, 1, MPI_MAX, context,
913+
/**
914+
* The communicator's disjointness is inferred from max_local_peers.
915+
* Note: MPI_IN_PLACE cannot be used here because the parent could be an
916+
* inter-communicator
917+
*/
918+
ret = context->iallreduce_fn (&context->local_peers, &context->max_local_peers, 1, MPI_MAX, context,
913919
&subreq);
914920
if (OMPI_SUCCESS != ret) {
915921
ompi_comm_request_return (request);
@@ -919,7 +925,7 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
919925
} else {
920926
ompi_comm_request_schedule_append (request, ompi_comm_activate_nb_complete, NULL, 0);
921927
}
922-
928+
923929
ompi_comm_request_start (request);
924930

925931
*req = &request->super;

0 commit comments

Comments
 (0)