Skip to content

Commit 23df181

Browse files
committed
communicator bugfix: disjoint function does not have the correct max_local_peers value
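local_peers was passed to the non-blocking function iallreduce_fn as a stack variable, so the buffer could go out of scope before the reduction completed. Reduce in place on the context struct's max_local_peers instead (MPI_IN_PLACE), so the correct value is passed. Signed-off-by: Jessie Yang <jiaxiyan@amazon.com>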
1 parent f3d0c59 commit 23df181

File tree

1 file changed

+18
-14
lines changed

1 file changed

+18
-14
lines changed

ompi/communicator/comm_cid.c

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -774,6 +774,11 @@ static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request);
774774
/* Callback function to set communicator disjointness flags */
775775
static inline void ompi_comm_set_disjointness_nb_complete(ompi_comm_cid_context_t *context)
776776
{
777+
/* Only set the disjoint flags when it is intra-communicator */
778+
if (OMPI_COMM_IS_INTER(*context->newcommp)) {
779+
return;
780+
}
781+
777782
if (OMPI_COMM_IS_DISJOINT_SET(*context->newcommp)) {
778783
opal_show_help("help-comm.txt", "disjointness-set-again", true);
779784
return;
@@ -870,7 +875,7 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
870875
ompi_comm_cid_context_t *context;
871876
ompi_comm_request_t *request;
872877
ompi_request_t *subreq;
873-
int ret = 0, local_peers = -1;
878+
int ret = 0;
874879

875880
/* the caller should not pass NULL for comm (it may be the same as *newcomm) */
876881
assert (NULL != comm);
@@ -902,20 +907,19 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
902907
OMPI_COMM_SET_PML_ADDED(*newcomm);
903908
}
904909

905-
/**
906-
* Dual-purpose barrier:
907-
* 1. The communicator's disjointness is inferred from max_local_peers.
908-
* 2. After the operation it is allowed to send messages over the new communicator.
909-
*/
910-
local_peers = context->max_local_peers;
911-
ret = context->iallreduce_fn (&local_peers, &context->max_local_peers, 1, MPI_MAX, context,
912-
&subreq);
913-
if (OMPI_SUCCESS != ret) {
914-
ompi_comm_request_return (request);
915-
return ret;
910+
if (OMPI_COMM_IS_INTRA(*newcomm)) {
911+
/* The communicator's disjointness is inferred from max_local_peers. */
912+
ret = context->iallreduce_fn (MPI_IN_PLACE, &context->max_local_peers, 1, MPI_MAX, context,
913+
&subreq);
914+
if (OMPI_SUCCESS != ret) {
915+
ompi_comm_request_return (request);
916+
return ret;
917+
}
918+
ompi_comm_request_schedule_append (request, ompi_comm_activate_nb_complete, &subreq, 1);
919+
} else {
920+
ompi_comm_request_schedule_append (request, ompi_comm_activate_nb_complete, NULL, 0);
916921
}
917-
918-
ompi_comm_request_schedule_append (request, ompi_comm_activate_nb_complete, &subreq, 1);
922+
919923
ompi_comm_request_start (request);
920924

921925
*req = &request->super;

0 commit comments

Comments
 (0)