Skip to content

Commit 4215325

Browse files
committed
osc: Fix rdma component when not using ob1
When the ob1 PML is not eligible for selection (such as when the user sets --mca pml cm), the BML and BTL frameworks are not initialized, and the rdma osc component later fails because there are no BTLs available. This patch resolves the issue by having the rdma osc component initialize the BML interface. Making this change required two additional, related changes. First, since the BTLs use the modex, the rdma initialization must be moved before the modex point, so that putting data in the modex works as expected. Second, BTLs can require loading the entire world during init (such as usnic, or TCP when there are multiple threads and multiple NICs), so we extend the world loading checks to include OSC. Since the other Portals4 components say that they require world loading, we assume the Portals4 osc component requires it as well. Signed-off-by: Brian Barrett <bbarrett@amazon.com>
1 parent 4038fd6 commit 4215325

File tree

5 files changed

+41
-6
lines changed

5 files changed

+41
-6
lines changed

ompi/instance/instance.c

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,10 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
536536
return ompi_instance_print_error ("mca_pml_base_select() failed", ret);
537537
}
538538

539+
if (OMPI_SUCCESS != (ret = ompi_osc_base_find_available (OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) {
540+
return ompi_instance_print_error ("ompi_osc_base_find_available() failed", ret);
541+
}
542+
539543
OMPI_TIMING_IMPORT_OPAL("orte_init");
540544
OMPI_TIMING_NEXT("rte_init-commit");
541545

@@ -617,10 +621,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
617621
return ompi_instance_print_error ("mca_coll_base_find_available() failed", ret);
618622
}
619623

620-
if (OMPI_SUCCESS != (ret = ompi_osc_base_find_available (OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) {
621-
return ompi_instance_print_error ("ompi_osc_base_find_available() failed", ret);
622-
}
623-
624624
/* io and topo components are not selected here -- see comment
625625
above about the io and topo frameworks being loaded lazily */
626626

@@ -654,7 +654,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
654654
return ompi_instance_print_error ("ompi_attr_create_predefined_keyvals() failed", ret);
655655
}
656656

657-
if (mca_pml_base_requires_world ()) {
657+
if (mca_pml_base_requires_world() ||
658+
mca_osc_base_requires_world()) {
658659
/* need to set up comm world for this instance -- XXX -- FIXME -- probably won't always
659660
* be the case. */
660661
if (OMPI_SUCCESS != (ret = ompi_comm_init_mpi3 ())) {
@@ -699,7 +700,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
699700
/* some btls/mtls require we call add_procs with all procs in the job.
700701
* since the btls/mtls have no visibility here it is up to the pml to
701702
* convey this requirement */
702-
if (mca_pml_base_requires_world ()) {
703+
if (mca_pml_base_requires_world() ||
704+
mca_osc_base_requires_world()) {
703705
if (NULL == (procs = ompi_proc_world (&nprocs))) {
704706
return ompi_instance_print_error ("ompi_proc_get_allocated () failed", ret);
705707
}

ompi/mca/osc/base/osc_base_init.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
#include "ompi/communicator/communicator.h"
3131
#include "ompi/win/win.h"
3232

33+
bool ompi_osc_base_requires_world = false;
34+
3335
int
3436
ompi_osc_base_select(ompi_win_t *win,
3537
void **base,

ompi/mca/osc/osc.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ struct ompi_datatype_t;
5353
struct ompi_op_t;
5454
struct ompi_request_t;
5555

56+
57+
extern bool ompi_osc_base_requires_world;
58+
5659
/* ******************************************************************** */
5760

5861

@@ -419,6 +422,11 @@ typedef ompi_osc_base_module_4_0_0_t ompi_osc_base_module_t;
419422

420423
/* ******************************************************************** */
421424

425+
static inline bool mca_osc_base_requires_world (void)
426+
{
427+
return ompi_osc_base_requires_world;
428+
}
429+
422430

423431
END_C_DECLS
424432

ompi/mca/osc/portals4/osc_portals4_component.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,8 @@ component_init(bool enable_progress_threads, bool enable_mpi_threads)
349349
return ret;
350350
}
351351

352+
ompi_osc_base_requires_world = true;
353+
352354
return OMPI_SUCCESS;
353355
}
354356

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,27 @@ static int ompi_osc_rdma_component_init (bool enable_progress_threads,
345345
__FILE__, __LINE__, ret);
346346
}
347347

348+
ret = mca_bml_base_init(enable_progress_threads, enable_mpi_threads);
349+
if (OPAL_SUCCESS != ret) {
350+
opal_output_verbose(1, ompi_osc_base_framework.framework_output,
351+
"%s:%d: bml_base_init() failed: %d",
352+
__FILE__, __LINE__, ret);
353+
return ret;
354+
}
355+
356+
/* check if any btls do not support dynamic add_procs */
357+
mca_btl_base_selected_module_t* selected_btl;
358+
OPAL_LIST_FOREACH(selected_btl, &mca_btl_base_modules_initialized,
359+
mca_btl_base_selected_module_t) {
360+
mca_btl_base_module_t *btl = selected_btl->btl_module;
361+
362+
if (btl->btl_flags & MCA_BTL_FLAGS_SINGLE_ADD_PROCS) {
363+
ompi_osc_base_requires_world = true;
364+
break;
365+
}
366+
367+
}
368+
348369
return ret;
349370
}
350371

0 commit comments

Comments
 (0)