Skip to content

Commit 8040d05

Browse files
committed
osc/rdma: ensure bml add_procs has been called for all local procs
This fixes a bug that occurs when ob1 is not selected as the pml but osc/rdma may still be selected for an MPI window. In some cases osc/rdma may use btl/sm. If this is the case we need to ensure btl/sm knows about all the local procs (not just the ones in the communicator). btl/sm currently requires this in order to function correctly. In the future btl/sm should be made more resilient. Fixes #8434. Signed-off-by: Nathan Hjelm <hjelmn@google.com>
1 parent 651e2ed commit 8040d05

File tree

1 file changed

+32
-1
lines changed

1 file changed

+32
-1
lines changed

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
2222
* Copyright (c) 2019 Research Organization for Information Science
2323
* and Technology (RIST). All rights reserved.
24-
* Copyright (c) 2020 Google, LLC. All rights reserved.
24+
* Copyright (c) 2020-2021 Google, LLC. All rights reserved.
2525
* $COPYRIGHT$
2626
*
2727
* Additional copyrights may follow
@@ -815,6 +815,34 @@ static int ompi_osc_rdma_query_mtls (void)
815815
return -1;
816816
}
817817

818+
/**
819+
* @brief ensure that all local procs are added to the bml
820+
*
821+
* The sm btl requires that all local procs be added to work correctly. If pml/ob1
822+
* was not selected then we can't rely on this property. Since osc/rdma may use
823+
* btl/sm we need to ensure that btl/sm is set up correctly. This function will
824+
* only (potentially) call add_procs on local procs.
825+
*/
826+
static void ompi_osc_rdma_ensure_local_add_procs (void)
827+
{
828+
size_t nprocs;
829+
ompi_proc_t** procs = ompi_proc_get_allocated (&nprocs);
830+
if (NULL == procs) {
831+
/* weird, this should have caused MPI_Init to fail */
832+
return;
833+
}
834+
835+
for (size_t proc_index = 0 ; proc_index < nprocs ; ++proc_index) {
836+
ompi_proc_t *proc = procs[proc_index];
837+
if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
838+
/* this will cause add_proc to get called if it has not already been called */
839+
(void) mca_bml_base_get_endpoint (proc);
840+
}
841+
}
842+
843+
free(procs);
844+
}
845+
818846
static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_base_module_t **btl)
819847
{
820848
struct mca_btl_base_module_t **possible_btls = NULL;
@@ -859,6 +887,9 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_b
859887
return OMPI_SUCCESS;
860888
}
861889

890+
/* if osc/rdma gets selected we need to ensure that all local procs have been added */
891+
ompi_osc_rdma_ensure_local_add_procs ();
892+
862893
for (int rank = 0 ; rank < comm_size ; ++rank) {
863894
ompi_proc_t *proc = ompi_comm_peer_lookup (comm, rank);
864895
mca_bml_base_endpoint_t *endpoint;

0 commit comments

Comments
 (0)